Package Loading and Data Importing

Load Packages and Custom Functions

# BiocManager::install("mixOmicsTeam/mixOmics")
# remotes::install_github("ying14/yingtools2")
# remotes::install_github("mikabr/ggpirate")
# devtools::install_github("EmilHvitfeldt/paletteer")

library(pacman)
p_load(
  factoextra,               # Dimension reduction
  FactoMineR,               # Dimension reduction
  stabs,                    # Stability variable selection
  mboost,                   # Gradient boosting for model building
  ggrepel,                  # Visualization, repels labels on plots
  bestglm,                  # Logistic regression
  knitr,                    # To change R markdown PDF options
  caret,                    # To calculate model paramters
  cutpointr,                # Calculate cutpoints
  RPostgreSQL,              # Connect to PostgreSQL server
  phyloseq,                 # Phyloseq data wrangling
  microbiomeMarker,         # LEfSe analysis
  ComplexHeatmap,           # Heatmap generator
  paletteer,                # Extensive color palette
  tidyverse,                # Data wrangling and visualization
  circlize,                 # Color ramp builder
  rstatix,                  # Tidyverse statistics
  EnhancedVolcano,          # Volcano plot analysis
  umap,                     # Uniform Manifold Approximation Projection
  ggpubr,                   # Simple ggplots with stats
  ggpirate,                 # ggplot version of Pirate plot
  mixOmics,                 # PLS-DA
  conflicted,               # Forces conflicted packages to be used by preferred package
  gridExtra,                # Arrange multiple grobs
  reticulate,               # Run python in R
  tableone,                 # Additional clinical tables support
  lubridate,                # Manipulate date objects
  yingtools2,               # Custom plotting functions
  cowplot,                  # Combine plots together
  epiR,                     # Obtain model performance measures
  pROC,                     # Produce ROC curves
  plotrix,                  # Add table to base R plot
  glmnet,                   # LASSO regression
  forcats,                  # Factor reordering
  ggpmisc,                  # Miscellaneous ggplot helper functions
  doParallel,               # Parallelize functions (for optimizing cutpoints)
  PRISMAstatement,          # Flow table generator
  DiagrammeRsvg,            # Flow table visualizer
  rsvg,                     # Flow table visualizer
  scales,                   # Scale functions for visualizations
  devtools,                 # To load packages from GitHub
  install = F
)

library("gtsummary")  # Clinical tables, pacman had trouble loading this in, so had to go the manual route

conflict_prefer("select", "dplyr")
conflict_prefer("mutate", "dplyr")
conflict_prefer("filter", "dplyr")
conflict_prefer("rename", "dplyr")
conflict_prefer("slice", "dplyr")
conflict_prefer("between", "dplyr")
conflict_prefer("annotate", "ggplot2")

devtools::source_url("https://github.com/yingeddi2008/DFIutility/blob/master/getRdpPal.R?raw=TRUE")

# ggplot theme shortcuts
et <- element_text
eb <- element_blank
er <- element_rect
el <- element_line

opts_chunk$set(tidy.opts = list(width.cutoff = 60), tidy = TRUE)

`%!in%` <- negate(`%in%`)

# Running python in R
## User must set path to their python library
reticulate::use_python("/usr/bin/python3")

# Customized lefse function
source("./Data/run_lefse2.R")

Import Data

Load in all Metagenomic, Metabolomic and Clinical Data

# 1) Load R image load('./Data/LT_Modeling.RData')

# OR #

# 2) Individually Load R Objects

# Sample lookup
sample_lookup <- readRDS("./Data/sample_lookup.rds")

# First samples from all patients
first_samps <- readRDS("./Data/first_samps_anon.rds")

# Simplified dataframe containing infection information and
# relative abundance of targeted taxa
peri_matrix_all <- readRDS("./Data/peri_matrix_clin_all.rds")

# Distinct detailed infection data
peri_criteria_best <- readRDS("./Data/peri_criteria_best_anon.rds")

# All detailed infection data
peri_criteria_all <- readRDS("./Data/peri_criteria_all_anon.rds")

# NCBI taxonomy lookup
tax_lookup <- readRDS("./Data/tax_lookup.rds")

# Complete metaphlan dataframe for all patients and healthy
# donors
metaphlan_df <- readRDS("./Data/metaphlan_anon.rds")

# Metaphlan dataframe of patient samples
metaphlan_peri_anon <- readRDS("./Data/metaphlan_peri_anon.rds")

# Custom color palette
metaphlan_pal <- getRdpPal(metaphlan_df)

# Qualitative metabolomics
metab_qual_anon <- readRDS("./Data/metab_qual_anon.rds")

# Quantitative metabolomics
metab_quant_anon <- readRDS("./Data/metab_quant_anon.rds")

# Antibiotics data
abx <- readRDS("./Data/abx_anon.rds")

# Demographics data
demo <- readRDS("./Data/demo_anon.rds")

Figures

Figure 4C: Optimal Cutpoint Analysis: Relative Abundance

# Custom function to avoid any errors during map
safe_cutpointr <- possibly(.f = cutpointr, otherwise = "Error")

# Calculate the number of cores
no_cores <- detectCores() - 2

# create the cluster for caret to use
cl <- makePSOCKcluster(no_cores)
registerDoParallel(cl)

set.seed(123456)
test_abundance <- peri_matrix_all %>%
    mutate(sampleID = as.factor(sampleID)) %>%
    select(starts_with(c("entero", "esch", "klebs", "citro",
        "rahn", "proteus")), -ends_with("abs_abundance")) %>%
    mutate_if(is.character, as.numeric) %>%
    mutate(enterococcus_infection = ifelse(`Enterococcus faecium` +
        `Enterococcus faecalis` + `Enterococcus avium` >= 1,
        1, 0), enterobacterales_infection = ifelse(`Escherichia coli` +
        `Klebsiella pneumoniae` + `Citrobacter freundii` + `Proteus mirabilis` >=
        1, 1, 0)) %>%
    select(-c(`Enterococcus faecium`, `Enterococcus faecalis`,
        `Enterococcus avium`, `Escherichia coli`, `Klebsiella pneumoniae`,
        `Citrobacter freundii`, `Proteus mirabilis`)) %>%
    pivot_longer(-c(enterococcus_infection, enterobacterales_infection),
        names_to = "variable", values_to = "value") %>%
    pivot_longer(-c(variable, value), names_to = "infection_type",
        values_to = "infection") %>%
    mutate(variable = paste0("Input: ", variable, "\nPredict: ",
        infection_type)) %>%
    group_by(infection_type, variable) %>%
    group_map(~safe_cutpointr(., value, infection, variable,
        method = maximize_metric, metric = youden, pos_class = 1,
        boot_runs = 100, allowParallel = TRUE, na.rm = T), .keep = TRUE)

# to get cutpoint object
test_abundance[4][[1]]  # ecoc rel abd, ecoc infection
## # A tibble: 1 × 18
##   subgroup                                                             direction
##   <chr>                                                                <chr>    
## 1 "Input: enterococcus_rel_abundance\nPredict: enterococcus_infection" >=       
##   optimal_cutpoint method            youden      acc sensitivity specificity
##              <dbl> <chr>              <dbl>    <dbl>       <dbl>       <dbl>
## 1         0.199356 maximize_metric 0.577090 0.723214    0.882353    0.694737
##        AUC pos_class neg_class prevalence outcome   predictor grouping
##      <dbl>     <dbl>     <dbl>      <dbl> <chr>     <chr>     <chr>   
## 1 0.829721         1         0   0.151786 infection value     variable
##   data               roc_curve            boot               
##   <list>             <list>               <list>             
## 1 <tibble [112 × 2]> <rc_ctpnt [82 × 10]> <tibble [100 × 23]>
test_abundance[1][[1]]  # ebac rel abd, ebacc infection
## # A tibble: 1 × 18
##   subgroup                                                                    
##   <chr>                                                                       
## 1 "Input: enterobacterales_rel_abundance\nPredict: enterobacterales_infection"
##   direction optimal_cutpoint method            youden    acc sensitivity
##   <chr>                <dbl> <chr>              <dbl>  <dbl>       <dbl>
## 1 >=               0.0247691 maximize_metric 0.798077 0.8125           1
##   specificity      AUC pos_class neg_class prevalence outcome   predictor
##         <dbl>    <dbl>     <dbl>     <dbl>      <dbl> <chr>     <chr>    
## 1    0.798077 0.905048         1         0  0.0714286 infection value    
##   grouping data               roc_curve            boot               
##   <chr>    <list>             <list>               <list>             
## 1 variable <tibble [112 × 2]> <rc_ctpnt [59 × 10]> <tibble [100 × 23]>
test_abundance_unnest <- test_abundance %>%
    map_df(as_tibble)


test_abundance_unnest %>%
    separate(subgroup, into = c("Input", "Predict"), sep = "\n",
        remove = F) %>%
    filter(grepl("enterobacterales", Input) & grepl("enterobacterales",
        Predict) | grepl("enterococcus", Input) & grepl("enterococcus",
        Predict)) %>%
    group_by(pos_class) %>%
    ungroup() %>%
    unnest(roc_curve) %>%
    arrange(desc(AUC)) %>%
    select(-boot) %>%
    mutate(auc_label = paste0("AUC = ", formatC(round(AUC, 3) *
        100, digits = 0, format = "f"), "%"), auc_label = case_when(grepl("enterococcus_rel_abundance",
        Input) ~ paste0(auc_label, " [", paste0(formatC(pull(round(boot_ci(x = test_abundance[4][[1]],
        AUC, alpha = 0.05)[3][1, ], 2)) * 100, digits = 0, format = "f"),
        "%"), ", ", paste0(formatC(pull(round(boot_ci(x = test_abundance[4][[1]],
        AUC, alpha = 0.05)[3][2, ], 2)) * 100, digits = 0, format = "f"),
        "%"), "]"), grepl("enterobacterales_rel_abundance", Input) ~
        paste0(auc_label, " [", paste0(formatC(pull(round(boot_ci(x = test_abundance[1][[1]],
            AUC, alpha = 0.05)[3][1, ], 2)) * 100, digits = 0,
            format = "f"), "%"), ", ", paste0(formatC(pull(round(boot_ci(x = test_abundance[1][[1]],
            AUC, alpha = 0.05)[3][2, ], 2)) * 100, digits = 0,
            format = "f"), "%"), "]")), acc_label = paste0("Accuracy = ",
        formatC(round(acc, 3) * 100, digits = 0, format = "f"),
        "%"), acc_label = case_when(grepl("enterococcus_rel_abundance",
        Input) ~ paste0(acc_label, " [", paste0(formatC(pull(round(boot_ci(x = test_abundance[4][[1]],
        acc, alpha = 0.05)[3][1, ], 2)) * 100, digits = 0, format = "f"),
        "%"), ", ", paste0(formatC(pull(round(boot_ci(x = test_abundance[4][[1]],
        acc, alpha = 0.05)[3][2, ], 2)) * 100, digits = 0, format = "f"),
        "%"), "]"), grepl("enterobacterales_rel_abundance", Input) ~
        paste0(acc_label, " [", paste0(formatC(pull(round(boot_ci(x = test_abundance[1][[1]],
            acc, alpha = 0.05)[3][1, ], 2)) * 100, digits = 0,
            format = "f"), "%"), ", ", paste0(formatC(pull(round(boot_ci(x = test_abundance[1][[1]],
            acc, alpha = 0.05)[3][2, ], 2)) * 100, digits = 0,
            format = "f"), "%"), "]")), sens_label = paste0("Sensitivity = ",
        formatC(round(sensitivity, 3) * 100, digits = 0, format = "f"),
        "%"), sens_label = case_when(grepl("enterococcus_rel_abundance",
        Input) ~ paste0(sens_label, " [", paste0(formatC(pull(round(boot_ci(x = test_abundance[4][[1]],
        sensitivity, alpha = 0.05)[3][1, ], 2)) * 100, digits = 0,
        format = "f"), "%"), ", ", paste0(formatC(pull(round(boot_ci(x = test_abundance[4][[1]],
        sensitivity, alpha = 0.05)[3][2, ], 2)) * 100, digits = 0,
        format = "f"), "%"), "]"), grepl("enterobacterales_rel_abundance",
        Input) ~ paste0(sens_label, " [", paste0(formatC(pull(round(boot_ci(x = test_abundance[1][[1]],
        sensitivity, alpha = 0.05)[3][1, ], 2)) * 100, digits = 0,
        format = "f"), "%"), ", ", paste0(formatC(pull(round(boot_ci(x = test_abundance[1][[1]],
        sensitivity, alpha = 0.05)[3][2, ], 2)) * 100, digits = 0,
        format = "f"), "%"), "]")), spec_label = paste0("Specificity = ",
        formatC(round(specificity, 3) * 100, digits = 0, format = "f"),
        "%"), spec_label = case_when(grepl("enterococcus_rel_abundance",
        Input) ~ paste0(spec_label, " [", paste0(formatC(pull(round(boot_ci(x = test_abundance[4][[1]],
        specificity, alpha = 0.05)[3][1, ], 2)) * 100, digits = 0,
        format = "f"), "%"), ", ", paste0(formatC(pull(round(boot_ci(x = test_abundance[4][[1]],
        specificity, alpha = 0.05)[3][2, ], 2)) * 100, digits = 0,
        format = "f"), "%"), "]"), grepl("enterobacterales_rel_abundance",
        Input) ~ paste0(spec_label, " [", paste0(formatC(pull(round(boot_ci(x = test_abundance[1][[1]],
        specificity, alpha = 0.05)[3][1, ], 2)) * 100, digits = 0,
        format = "f"), "%"), ", ", paste0(formatC(pull(round(boot_ci(x = test_abundance[1][[1]],
        specificity, alpha = 0.05)[3][2, ], 2)) * 100, digits = 0,
        format = "f"), "%"), "]"))) %>%
    filter(grepl(pattern = "rel_abundance", x = Input)) %>%
    mutate(subgroup = str_to_title(subgroup), subgroup = gsub(pattern = "_",
        replacement = " ", x = subgroup), subgroup = gsub(pattern = "rel abundance",
        replacement = "Relative Abundance (%)", x = subgroup),
        subgroup2 = ifelse(grepl(x = subgroup, pattern = "enterococcus",
            ignore.case = TRUE), "Enterococcus", "Enterobacterales"),
        subgroup2 = factor(subgroup2, levels = c("Enterococcus",
            "Enterobacterales"))) %>%
    ggplot(aes(x = fpr, y = tpr, color = subgroup2)) + geom_line(size = 1.2) +
    geom_text(aes(label = auc_label, x = 0.5, y = 0.13), show.legend = F,
        hjust = 0) + geom_text(aes(label = acc_label, x = 0.5,
    y = 0.09), show.legend = F, hjust = 0) + geom_text(aes(label = spec_label,
    x = 0.5, y = 0.05), show.legend = F, hjust = 0) + geom_text(aes(label = sens_label,
    x = 0.5, y = 0.01), show.legend = F, hjust = 0) + geom_text(aes(label = paste0("Cutpoint = ",
    round((optimal_cutpoint), digits = 3) * 100, "%"), x = 0.5,
    y = 0.17), hjust = 0, show.legend = F) + theme_bw() + theme(axis.text = et(color = "black",
    size = 12), axis.title = et(color = "black", size = 14),
    legend.text = et(size = 12), legend.title = et(size = 14),
    legend.spacing.y = unit(0.5, "cm"), legend.position = "none",
    panel.grid = eb(), strip.text = et(size = 14), strip.background = eb(),
    panel.border = eb(), panel.spacing.y = unit(20, "mm"), axis.line.x = el(color = "black")) +
    geom_vline(xintercept = -0.05) + geom_hline(yintercept = -0.05) +
    scale_x_continuous(expand = expansion(add = c(0.001, 0.05))) +
    scale_y_continuous(expand = expansion(add = c(0.001, 0.05))) +
    facet_wrap(~subgroup2, ncol = 1, scales = "fixed") + ylab("True Positive Rate\n") +
    xlab("\nFalse Positive Rate") + scale_color_manual(values = c("#2dc46b",
    "red")) + guides(color = guide_legend("Groups", byrow = T,
    override.aes = list(size = 5)))

ggsave(filename = "./Results/Figure_4C.pdf", height = 12, width = 7)


# obtain threshold cutpoints for relative abundance
optimal_cutpoint_rel <- test_abundance_unnest %>%
    separate(subgroup, into = c("Input", "Predict"), sep = "\n",
        remove = F) %>%
    filter(grepl("enterobacterales", Input) & grepl("enterobacterales",
        Predict) | grepl("enterococcus", Input) & grepl("enterococcus",
        Predict)) %>%
    group_by(pos_class) %>%
    filter(grepl("rel", subgroup)) %>%
    select(subgroup, optimal_cutpoint)

Figure 5A: Enterococcus Expansion: Volcano Analysis

# Heatmap compounds and their categories
heatmap_lookup <- read.csv("./Data/qual_heatmap_lookup.csv",
    stringsAsFactors = FALSE)

# Build heatmap compound list
heatmap_cmpds <- metab_qual_anon %>%
    mutate(compound = str_to_title(compound), compound = recode(compound,
        Preq1 = "PreQ1")) %>%
    filter(compound %in% heatmap_lookup$compound) %>%
    distinct(compound) %>%
    drop_na()

qual_log2fc_ecoc_expan <- metab_qual_anon %>%
    mutate(compound = ifelse(compound == "isovaleric-acid", "isovalerate",
        compound), compound = str_to_title(compound), compound = recode(compound,
        Preq1 = "PreQ1")) %>%
    filter(compound %in% heatmap_cmpds$compound) %>%
    left_join(peri_matrix_all %>%
        select(sampleID, enterococcus_rel_abundance)) %>%
    drop_na() %>%
    mutate(enterococcus_expansion = ifelse(enterococcus_rel_abundance >=
        optimal_cutpoint_rel$optimal_cutpoint[2], 1, 0)) %>%
    arrange(enterococcus_expansion, sampleID) %>%
    group_by(compound) %>%
    mutate(enterococcus_expansion_0 = length(mvalue[enterococcus_expansion ==
        "0"]), enterococcus_expansion_1 = length(mvalue[enterococcus_expansion ==
        "1"])) %>%
    filter(any(mvalue != 0)) %>%
    summarise(log2fc_val = log((mean(mvalue[enterococcus_expansion ==
        "0"], na.rm = T)/mean(mvalue[enterococcus_expansion ==
        "1"], na.rm = T)), base = 2))  # 0 = No Expansion, 1 = Expansion


qual_pval_ecoc_expan <- metab_qual_anon %>%
    mutate(compound = ifelse(compound == "isovaleric-acid", "isovalerate",
        compound), compound = str_to_title(compound), compound = recode(compound,
        Preq1 = "PreQ1")) %>%
    filter(compound %in% heatmap_cmpds$compound) %>%
    left_join(peri_matrix_all %>%
        select(sampleID, enterococcus_rel_abundance)) %>%
    drop_na() %>%
    mutate(enterococcus_expansion = ifelse(enterococcus_rel_abundance >=
        optimal_cutpoint_rel$optimal_cutpoint[2], 1, 0)) %>%
    arrange(enterococcus_expansion, sampleID) %>%
    group_by(compound) %>%
    mutate(enterococcus_expansion_0 = length(mvalue[enterococcus_expansion ==
        "0"]), enterococcus_expansion_1 = length(mvalue[enterococcus_expansion ==
        "1"])) %>%
    filter(any(mvalue != 0)) %>%
    rstatix::wilcox_test(mvalue ~ enterococcus_expansion) %>%
    rstatix::adjust_pvalue(method = "BH") %>%
    rstatix::add_significance("p.adj")

qual_tot_ecoc_expan <- left_join(qual_log2fc_ecoc_expan, qual_pval_ecoc_expan) %>%
    column_to_rownames(var = "compound")

# volcano label color
ecoc_expan_volcano_labcol <- qual_tot_ecoc_expan %>%
    filter(p.adj <= 0.05 & abs(log2fc_val) >= 0.75) %>%
    mutate(color = ifelse(log2fc_val < 0, "#389458", "gray47"))

# Volcano Plot (adjusted)
set.seed(123456)
volcano_adj_ecoc_expan <- EnhancedVolcano(qual_tot_ecoc_expan,
    lab = rownames(qual_tot_ecoc_expan), x = "log2fc_val", y = "p.adj",
    title = NULL, pCutoff = 0.05, FCcutoff = 0.75, pointSize = 4,
    labSize = 4, labCol = ecoc_expan_volcano_labcol$color, caption = NULL,
    colAlpha = 0.65, col = c("gray75", c("#D4CA15", "#912777",
        "#1238E3")), xlim = c(-2.5, 4), ylim = c(0, 8), legendPosition = "right",
    legendLabels = c(expression(p.adj > 0.05 * ";" ~ Log[2] ~
        FC < "±" * 0.75), expression(p.adj > 0.05 * ";" ~ Log[2] ~
        FC >= "±" * 0.75), expression(p.adj <= 0.05 * ";" ~
        Log[2] ~ FC < "±" * 0.75), expression(p.adj <= 0.05 *
        ";" ~ Log[2] ~ FC >= "±" * 0.75)), legendLabSize = 14,
    boxedLabels = T, drawConnectors = T, widthConnectors = 0.2,
    arrowheads = F, gridlines.minor = F, gridlines.major = F,
    max.overlaps = Inf) + theme(axis.text = et(color = "black"),
    legend.text = et(hjust = 0), plot.margin = unit(c(0, 4, 2,
        4), "cm")) + labs(subtitle = NULL) + annotate("segment",
    x = 0.8, xend = 3, y = 7.95, yend = 7.95, arrow = arrow(),
    size = 2, color = "gray67") + annotate("text", x = 1.65,
    y = 8.15, label = "No Expansion", size = 6, color = "gray67") +
    annotate("rect", xmin = 0.75, xmax = Inf, ymin = -log(0.05,
        base = 10), ymax = Inf, alpha = 0.1, fill = "gray87") +
    annotate("segment", x = -0.8, xend = -2.5, y = 7.95, yend = 7.95,
        arrow = arrow(), size = 2, color = "#389458") + annotate("text",
    x = -1.55, y = 8.15, label = "Expansion", size = 6, color = "#389458") +
    annotate("rect", xmin = -0.75, xmax = -Inf, ymin = -log(0.05,
        base = 10), ymax = Inf, alpha = 0.1, fill = "#389458") +
    guides(color = guide_legend(nrow = 4), shape = guide_legend(nrow = 4)) +
    ggtitle("Enterococcus Expansion") + scale_y_continuous(expand = expansion(add = c(0,
    0.15)))

volcano_adj_ecoc_expan

ggsave(plot = volcano_adj_ecoc_expan, filename = "./Results/Figure_5A.pdf",
    width = 24, height = 8)

Enterococcus Expansion: Distribution

# Using unadjusted p-values for down-selection of
# metabolites, show distribution of normalized peak area
boxplot_ecoc_expan <- metab_qual_anon %>%
    mutate(compound = ifelse(compound == "isovaleric-acid", "isovalerate",
        compound), compound = str_to_title(compound), compound = recode(compound,
        Preq1 = "PreQ1")) %>%
    filter(compound %in% heatmap_cmpds$compound) %>%
    left_join(peri_matrix_all %>%
        select(sampleID, enterococcus_rel_abundance)) %>%
    drop_na() %>%
    mutate(enterococcus_expansion = ifelse(enterococcus_rel_abundance >=
        optimal_cutpoint_rel$optimal_cutpoint[2], 1, 0), enterococcus_expansion = ifelse(enterococcus_expansion ==
        1, "Expansion", "No Expansion")) %>%
    arrange(enterococcus_expansion, sampleID) %>%
    group_by(compound) %>%
    mutate(enterococcus_expansion_0 = length(mvalue[enterococcus_expansion ==
        "No Expansion"]), enterococcus_expansion_1 = length(mvalue[enterococcus_expansion ==
        "Expansion"])) %>%
    filter(any(mvalue != 0)) %>%
    right_join(qual_tot_ecoc_expan %>%
        rownames_to_column(var = "compound") %>%
        mutate(abs_log2fc_val = abs(log2fc_val)) %>%
        filter(p <= 0.05, abs_log2fc_val >= 1))

if (nrow(boxplot_ecoc_expan) > 0) {
    print(ggpar(ggboxplot(boxplot_ecoc_expan, x = "enterococcus_expansion",
        y = "mvalue", color = "enterococcus_expansion", palette = c("#389458",
            "gray87"), ylab = "Normalized Peak Area", xlab = "",
        outlier.shape = NA), legend.title = "Enterococcus") +
        stat_compare_means(label.y.npc = 0.75) + facet_wrap(~compound,
        scales = "free_y") + geom_point(data = boxplot_ecoc_expan,
        aes(x = enterococcus_expansion, y = mvalue, color = enterococcus_expansion),
        position = position_jitter(width = 0.2), size = 2, alpha = 0.65))

    ggsave(filename = "./Results/enterococcus_expansion_boxplot.pdf",
        width = 24, height = 18)
} else {
    print("no significant observations")
}

taxUMAP Data Preparation

# Custom colors
pirate_colors <- c("#1A49BE", "#3A001E")
pirate_colors2 <- c("#3A001E", "#3A001E", "#3A001E", "#1A49BE")

t_metaphlan <- metaphlan_df %>%
    filter(sampleID %in% first_samps$sampleID | grepl(sampleID,
        pattern = "hd")) %>%
    mutate(db = ifelse(grepl(sampleID, pattern = "lt"), "Liver Transplant",
        "Healthy Donor")) %>%
    select(sampleID, taxid, db, pctseqs, Total) %>%
    group_by(sampleID, taxid, pctseqs) %>%
    slice(1) %>%
    ungroup() %>%
    filter(pctseqs >= 1e-04) %>%
    group_by(sampleID) %>%
    dplyr::add_count(taxid, name = "totalSp") %>%
    mutate(sampleID_count = length(unique(sampleID)), spPres = totalSp/sampleID_count) %>%
    filter(spPres >= 0.1) %>%
    select(-c(Total, sampleID_count, spPres, totalSp)) %>%
    group_by(sampleID) %>%
    mutate(pctseqs = pctseqs/sum(pctseqs))


t_metaphlan_mat <- t_metaphlan %>%
    distinct() %>%
    pivot_wider(names_from = "taxid", values_from = "pctseqs",
        values_fill = 0) %>%
    column_to_rownames(var = "sampleID") %>%
    select(-db)


# taxUMAP microbiota table
taxumap_microbiota <- t_metaphlan_mat %>%
    rownames_to_column(var = "index_column") %>%
    pivot_longer(-index_column, names_to = "taxid", values_to = "pctseq") %>%
    mutate(taxid = paste0("taxID", taxid)) %>%
    pivot_wider(names_from = "taxid", values_from = "pctseq")

write.csv(taxumap_microbiota, "./Results/microbiota_table.csv",
    row.names = FALSE)

# taxUMAP taxonomy table
taxumap_taxonomy <- t_metaphlan_mat %>%
    rownames_to_column(var = "index_column") %>%
    pivot_longer(-index_column, names_to = "taxid", values_to = "pctseq") %>%
    left_join(tax_lookup %>%
        mutate(taxid = as.character(taxid))) %>%
    mutate(taxid = paste0("taxID", taxid)) %>%
    distinct(Kingdom, Phylum, Class, Order, Family, Genus, Species,
        taxid) %>%
    transmute(OTU = taxid, Kingdom, Phylum = if_else(Phylum ==
        "", "unclassified", Phylum), Class = if_else(Class ==
        "", "unclassified", Class), Order = if_else(Order ==
        "", "unclassified", Order), Family = if_else(Family ==
        "", "unclassified", Family), Genus = if_else(Genus ==
        "", "unclassified", Genus), Genus = gsub(" ", "_", Genus),
        Species = if_else(Species == "", "unclassified", Species),
        Species = gsub(" ", "_", Species))
write.csv(taxumap_taxonomy, "./Results/taxonomy.csv", row.names = FALSE)

TaxUMAP Analysis

import sys

print(sys.path)

# USER MUST SET THEIR PATH TO THEIR LOCALLY INSTALLED TAXUMAP LOCATION
## ['', '/usr/bin', '/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.8/lib/python38.zip', '/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.8/lib/python3.8', '/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.8/lib/python3.8/lib-dynload', '/Users/nick/Library/Python/3.8/lib/python/site-packages', '/opt/anaconda3/bin/taxumap', '/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.8/lib/python3.8/site-packages', '/Library/Frameworks/R.framework/Versions/4.2/Resources/library/reticulate/python']
sys.path.insert(0, '/Users/nick/Documents/taxumap/taxumap/')

from taxumap.taxumap_base import Taxumap

# From file
tu = Taxumap(taxonomy='./Results/taxonomy.csv', microbiota_data='./Results/microbiota_table.csv',random_state=456)

# Transform the data (an inplace function)
## Phylum Class
## not monophyletic
## Class Order
## not monophyletic
## Order Family
## not monophyletic
## Family Genus
## not monophyletic
## post validate inputs main                Kingdom  ...                          Species
## ASV                     ...                                 
## taxID817      Bacteria  ...             Bacteroides_fragilis
## taxID818      Bacteria  ...     Bacteroides_thetaiotaomicron
## taxID820      Bacteria  ...            Bacteroides_uniformis
## taxID821      Bacteria  ...             Phocaeicola_vulgatus
## taxID823      Bacteria  ...       Parabacteroides_distasonis
## ...                ...  ...                              ...
## taxID28447    Bacteria  ...        Clavibacter_michiganensis
## taxID33968    Bacteria  ...  Leuconostoc_pseudomesenteroides
## taxID709323   Bacteria  ...         Fructobacillus_tropaeoli
## taxID1070421  Bacteria  ...            Periweissella_fabalis
## taxID2749962  Bacteria  ...         Lactococcus_paracarnosus
## 
## [672 rows x 7 columns]
## 
## /opt/anaconda3/bin/taxumap/taxumap/dataloading.py:86: UserWarning: Reading taxonomy table. Assuming columns are ordered by phylogeny with in descending order of hierarchy:
##                  e.g. Kingdom, Phylum, ... , Genus, Species, etc.
##                  Additionally, the OTU or ASV column must be labeled as 'OTU' or 'ASV' unless otherwise specified
##   warnings.warn(
tu.transform_self(neigh=55)

# Raw embedding dataframe
## Taxumap(agg_levels = ['Phylum', 'Family'], weights = [1, 1])
## 
## /opt/anaconda3/bin/taxumap/taxumap/taxumap_base.py:91: UserWarning: Setting min_dist to 0.05/sum(weights)
##   warnings.warn("Setting min_dist to 0.05/sum(weights)")
## /opt/anaconda3/bin/taxumap/taxumap/taxumap_base.py:102: UserWarning: Setting epochs to 5000
##   warnings.warn("Setting epochs to %d" % epochs)
## /opt/anaconda3/bin/taxumap/taxumap/tools.py:47: UserWarning: aggregating on Phylum
##   warnings.warn("aggregating on %s" % agg_level)
## /opt/anaconda3/bin/taxumap/taxumap/tools.py:47: UserWarning: aggregating on Family
##   warnings.warn("aggregating on %s" % agg_level)
embedding_df = tu.df_embedding

Figure 1: Shotgun Metagenomics, taxUMAP and Diversity

#### Alpha Diversity ####
# Alpha diversity matrix: Inverse Simpson
alpha_invsim <- vegan::diversity(t_metaphlan_mat, index = "invsimpson") %>%
  as.data.frame() %>% 
  rownames_to_column(var = "sampleID") %>% 
  dplyr::rename("InvSimpson" = ".")

# Alpha Diversity matrix: Shannon
alpha_shannon <- vegan::diversity(t_metaphlan_mat, index = "shannon") %>%
  as.data.frame() %>% 
  rownames_to_column(var = "sampleID") %>% 
  dplyr::rename("Shannon" = ".")

# Alpha Diversity matrix: Observed ASVs
alpha_richness <- vegan::specnumber(t_metaphlan_mat) %>%
  as.data.frame() %>% 
  rownames_to_column(var = "sampleID") %>% 
  dplyr::rename("Richness" = ".")

#### taxUMAP #####
# Find most abundant taxa per sample and plot just that
top_tax <- t_metaphlan %>%
  group_by(sampleID) %>%
  slice_max(pctseqs, n = 1) %>%
  left_join(tax_lookup) %>% 
  mutate(across(everything(), ~ifelse(.=="", NA, as.character(.)))) %>% 
  replace_na(list(Species="unclassified",
                  Genus="unclassified",
                  Family="unclassified",
                  Order="unclassified",
                  Class="unclassified",
                  Phylum="unclassified")) %>%
  mutate(Genus=ifelse(Genus=="unclassified",
                      paste(Family,Genus,sep="\n"),
                      as.character(Genus)),
         pctseqs = as.numeric(pctseqs))


metaphlan_df2 <- t_metaphlan %>% 
  mutate(db = factor(db, levels = c("Liver Transplant", "Healthy Donor"))) %>% 
  left_join(tax_lookup) %>% 
  drop_na(taxid) %>% 
  arrange(Kingdom, Phylum, Class, Order, Family, Genus) %>%
  mutate(Genus = paste0(Phylum,"-",Order,"-", Family, "-",Genus)) %>% 
  left_join(alpha_shannon) %>%
  group_by(sampleID) %>% 
  arrange(Genus) %>% 
  mutate(cum.pct = cumsum(pctseqs), 
         y.text = (cum.pct + c(0, cum.pct[-length(cum.pct)]))/2) %>% 
  ungroup() %>%
  dplyr::select(-cum.pct)

metaphlan_df_sumry <- 
  metaphlan_df2 %>% 
  group_by(sampleID) %>% 
  dplyr::slice(1) %>% 
  ungroup() %>% 
  mutate(healthy_min_shannon = min(Shannon[db == "Healthy Donor"]),
         diversity_group = case_when(
           db == "Healthy Donor" ~ "Healthy Donor",
           Shannon >= healthy_min_shannon ~ "High Diversity",
           TRUE ~ "TBD"
         ),
         lt_med_shannon = median(Shannon[diversity_group == "TBD"], na.rm = TRUE),
         diversity_group = case_when(
           diversity_group %in% c("Healthy Donor", "High Diversity") ~ diversity_group,
           Shannon >= lt_med_shannon ~ "Medium Diversity",
           TRUE ~ "Low Diversity"
         ),
         diversity_group = factor(
           diversity_group,
           levels = c(
             "Low Diversity",
             "Medium Diversity",
             "High Diversity",
             "Healthy Donor"
           )
         ),
         diversity_group_abv = gsub(pattern = " Diversity",
                                    replacement = "",
                                    x = diversity_group),
         diversity_group_abv = factor(diversity_group_abv,
                                      levels = c("Low", "Medium", "High", "Healthy Donor"))) %>% 
  arrange(Genus)


tax_umap_mat_plot <- py$embedding_df %>% 
  as.data.frame() %>% 
  rownames_to_column(var = "sampleID") %>% 
  left_join(t_metaphlan %>% 
              distinct(sampleID, db)) %>%
  left_join(top_tax) %>% 
  ggplot(aes(x = taxumap1, y = taxumap2, color = Genus, shape = db, size = pctseqs*100, alpha = db))+
  geom_point(fill = "black")+
  theme_bw() +
  theme(panel.grid = eb(),
        axis.title = et(color = "black", size = 14),
        axis.text = et(color = "black", size = 12),
        legend.title = et(color = "black", size = 14),
        legend.text = et(color = "black", size = 12),
        legend.position = "top"
        ) + 
  xlab("taxUMAP1") +
  ylab("taxUMAP1") +
  guides(
    shape = guide_legend(
      title = "Cohort",
      override.aes = list(size = 4),
      order = 1,
      nrow = 2,
      title.position = "top",
      title.hjust = 0.5
    ),
    size = guide_legend(
      title = "Relative Abundance (%)",
      order = 2,
      nrow = 2,
      title.position = "top",
      title.hjust = 0.5
    ),
    color = "none",
    fill = "none",
    alpha = "none"
  )+
  scale_color_manual(values = metaphlan_pal)+
  scale_shape_manual(values = c(24, 16))+
  scale_alpha_manual(values = c(1, 0.45))

tax_umap_mat_plot

ggsave(plot = tax_umap_mat_plot, 
            filename = "./Results/Figure_1B.pdf",
            height = 8, width = 8)

#### Alpha Diversity Stats #####
# Obtain stats for alpha diversity
diversity_comps <- list(
  c("Healthy Donor", "High"),
  c("High", "Medium"),
  c("High", "Low"),
  c("Medium", "Low")
  )

alpha_stats <- alpha_invsim %>% 
  left_join(alpha_shannon) %>% 
  left_join(alpha_richness) %>% 
  inner_join(metaphlan_df2 %>%
               left_join(metaphlan_df_sumry %>% select(sampleID, db, diversity_group_abv)) %>% 
               distinct(sampleID, db, diversity_group_abv)
             ) %>% 
  pivot_longer(!c(sampleID, db, diversity_group_abv),names_to = "diversity_metric", values_to = "value") %>% 
  group_by(diversity_metric) %>% 
  rstatix::wilcox_test(value~diversity_group_abv,
                        comparisons = diversity_comps,
                       p.adjust.method = "BH",
                       alternative= "two.sided"
                       ) %>% 
  ungroup()

# Create dataframe for all phylogentic levels of interest
phylo_rel_abd <- t_metaphlan %>% 
  left_join(tax_lookup) %>% 
  inner_join(metaphlan_df2 %>%
               left_join(metaphlan_df_sumry %>% select(sampleID, db, diversity_group_abv)) %>% 
               distinct(sampleID, db, diversity_group_abv)
             ) %>% 
  mutate(Species = paste(Kingdom, Phylum, Class, Order, Family, Genus, Species, sep = "|")) %>% 
  filter(grepl(pattern = "Enterococcus|Enterobacterales|Bacteroidetes|Lachnospiraceae|Oscillospiraceae", x = Species)) %>% 
  mutate(organism = case_when(grepl(pattern = "Enterococcus", x = Species) ~ "Enterococcus",
                              grepl(pattern = "Enterobacterales", x = Species) ~ "Enterobacterales",
                              grepl(pattern = "Bacteroidetes", x = Species) ~ "Bacteroidetes",
                              grepl(pattern = "Lachnospiraceae", x = Species) ~ "Lachnospiraceae",
                              grepl(pattern = "Oscillospiraceae", x = Species) ~ "Oscillospiraceae")) %>% 
  select(sampleID, db, diversity_group_abv, organism, pctseqs) %>% 
  group_by(sampleID, db, diversity_group_abv, organism) %>% 
  summarise(pctseqs = sum(pctseqs)) %>% 
  ungroup() %>% 
  pivot_wider(names_from = organism, values_from = pctseqs, values_fill = 0) %>% 
  pivot_longer(!c(sampleID, db, diversity_group_abv), names_to = "organism", values_to = "pctseqs")

# Obtain stats for all phylogentic levels of interest
rel_abd_alpha_stats <- phylo_rel_abd %>%  
  group_by(organism) %>% 
  rstatix::wilcox_test(pctseqs~diversity_group_abv) %>% 
  bind_rows(alpha_stats) %>% 
  rstatix::adjust_pvalue(method = "BH") %>% 
  mutate(p.adj = ifelse(p.adj < 0.001, 0.001, round(p.adj, 3)))

write.csv(rel_abd_alpha_stats, "./Results/Figure_1_Statistics.csv", row.names = FALSE)

#### Plot Inverse Simpson ####
symnum.args <- list(cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, Inf), symbols = c("****", "***", "**", "*", "ns"))

diversity_group_colors <- c("#3A001E", "#8A0246", "#C20463", "#1A49BE")

set.seed(456)
gg_alpha_invsim <- alpha_invsim %>% 
  inner_join(t_metaphlan %>% 
               distinct(sampleID, db) %>% 
               select(sampleID, db)
             ) %>% 
  inner_join(metaphlan_df %>%
               left_join(metaphlan_df_sumry %>% select(sampleID, db, diversity_group, diversity_group_abv)) %>% 
               distinct(sampleID, db, diversity_group, diversity_group_abv)
             ) %>% 
  mutate(db = factor(db, levels = c("Liver Transplant", "Healthy Donor"))) %>% 
  ggplot(., aes(x = diversity_group_abv, 
                y = InvSimpson, 
                color = diversity_group_abv, 
                fill = db)) +
  geom_boxplot(outlier.colour = NA,
               alpha = 0.35) +
  geom_jitter(width = 0.2, size = 2.5, alpha = 0.65)+
  stat_compare_means(comparisons = diversity_comps,
                     tip.length = 0.01,
                     step.increase = 0.075,
                     symnum.args = symnum.args,
                     method.args = list(alternative = "two.sided",
                                        exact = FALSE)
                     ) +
  theme_bw() +
  theme(
    panel.grid = eb(),
    axis.title.y = et(size = 14, color = "black"),
    axis.title.x = eb(),
    axis.text = et(size = 12, color = "black"),
    plot.margin = margin(t = 5,  # Top margin
                         r = 5,  # Right margin
                         b = 5,  # Bottom margin
                         l = 5), # Left margin
    panel.border = eb(),
    axis.line = el(color = 'black')
  ) +
  ylab("Alpha Diversity\n(Inverse Simpson)") +
  scale_fill_manual(values = rev(pirate_colors)) +
  scale_color_manual(values = diversity_group_colors) +
  guides(fill = guide_legend("Cohort"),
         color = guide_legend("Diversity Group",
                              override.aes = aes(label = "")))+
  scale_y_continuous(breaks = seq(0,45,5),
                     expand = expansion(mult = c(0.01, 0.1))) +
  coord_cartesian(xlim = c(1.1,3.9))
## [1] FALSE
gg_alpha_invsim

pdf(file = "./Results/Figure_1C.pdf", height = 6, width = 7)
gg_alpha_invsim
invisible(dev.off())

#### Plot Shannon Diversity ####
set.seed(456)
gg_alpha_shannon <- alpha_shannon %>% 
  inner_join(t_metaphlan %>% 
               distinct(sampleID, db) %>% 
               select(sampleID, db)
             ) %>% 
  inner_join(metaphlan_df %>%
               left_join(metaphlan_df_sumry %>% select(sampleID, db, diversity_group, diversity_group_abv)) %>% 
               distinct(sampleID, db, diversity_group, diversity_group_abv)
             ) %>% 
  mutate(db = factor(db, levels = c("Liver Transplant", "Healthy Donor"))) %>% 
  ggplot(., aes(x = diversity_group_abv, 
                y = Shannon, 
                color = diversity_group_abv, 
                fill = db)) +
  geom_boxplot(outlier.colour = NA,
               alpha = 0.35) +
  geom_jitter(width = 0.2, size = 2.5, alpha = 0.65)+
  stat_compare_means(comparisons = diversity_comps,
                     tip.length = 0.01,
                     step.increase = 0.075,
                     symnum.args = symnum.args,
                     method.args = list(alternative = "two.sided",
                                        exact = FALSE)
                     ) + 
  theme_bw() +
  theme(
    panel.grid = eb(),
    axis.title.y = et(size = 14, color = "black"),
    axis.title.x = eb(),
    axis.text = et(size = 12, color = "black"),
    plot.margin = margin(t = 5,  # Top margin
                         r = 5,  # Right margin
                         b = 5,  # Bottom margin
                         l = 5), # Left margin
    panel.border = eb(),
    axis.line = el(color = 'black')
  ) +
  ylab("Alpha Diversity\n(Shannon)") +
  scale_fill_manual(values = rev(pirate_colors)) +
  scale_color_manual(values = diversity_group_colors) +
  guides(fill = guide_legend("Cohort"),
         color = guide_legend("Diversity Group",
                              override.aes = aes(label = "")))+
  scale_y_continuous(breaks = seq(0,6.5,1),
                     expand = expansion(mult = c(0.01, 0.1))) +
  coord_cartesian(xlim = c(1.1,3.9))
## [1] FALSE
gg_alpha_shannon

pdf(file = "./Results/Figure_1D.pdf", height = 6, width = 7)
gg_alpha_shannon
invisible(dev.off())

#### Plot Species Richness ####
set.seed(456)
gg_alpha_richness <- alpha_richness %>% 
  inner_join(t_metaphlan %>% 
               distinct(sampleID, db) %>% 
               select(sampleID, db)
             ) %>% 
  inner_join(metaphlan_df %>%
               left_join(metaphlan_df_sumry %>% select(sampleID, db, diversity_group, diversity_group_abv)) %>% 
               distinct(sampleID, db, diversity_group, diversity_group_abv)
             ) %>% 
  mutate(db = factor(db, levels = c("Liver Transplant", "Healthy Donor"))) %>% 
  ggplot(., aes(x = diversity_group_abv, 
                y = Richness, 
                color = diversity_group_abv, 
                fill = db)) +
  geom_boxplot(outlier.colour = NA,
               alpha = 0.35) +
  geom_jitter(width = 0.2, size = 2.5, alpha = 0.65)+
  stat_compare_means(comparisons = diversity_comps,
                     tip.length = 0.01,
                     label.y = c(145, 160, 150, 110),
                     symnum.args = symnum.args,
                     method.args = list(alternative = "two.sided",
                                        exact = FALSE)
                     ) +
  theme_bw() +
  theme(
    panel.grid = eb(),
    axis.title.y = et(size = 14, color = "black"),
    axis.title.x = eb(),
    axis.text = et(size = 12, color = "black"),
    plot.margin = margin(t = 5,  # Top margin
                         r = 5,  # Right margin
                         b = 5,  # Bottom margin
                         l = 5), # Left margin
    panel.border = eb(),
    axis.line = el(color = 'black')
  ) +
  ylab("Alpha Diversity\n(Richness)") +
  scale_fill_manual(values = rev(pirate_colors)) +
  scale_color_manual(values = diversity_group_colors) +
  guides(fill = guide_legend("Cohort"),
         color = guide_legend("Diversity Group",
                              override.aes = aes(label = "")))+
  scale_y_continuous(breaks = seq(0,180,50),
                     expand = expansion(mult = c(0.01, 0.035))) +
  coord_cartesian(xlim = c(1.1,3.9))
## [1] FALSE
gg_alpha_richness

pdf(file = "./Results/Figure_1E.pdf", height = 6, width = 7)
gg_alpha_richness
invisible(dev.off())

#### Plot Enterococcus ####
set.seed(456)
gg_ecoc_rel_abd <- phylo_rel_abd %>% 
  filter(organism == "Enterococcus") %>% 
  group_by(organism) %>% 
  mutate(db = factor(db, levels = c("Liver Transplant", "Healthy Donor"))) %>% 
  ggplot(., aes(x = diversity_group_abv, 
                y = pctseqs, 
                color = diversity_group_abv, 
                fill = db)) +
  geom_boxplot(outlier.colour = NA,
               alpha = 0.35) +
  geom_jitter(width = 0.2, size = 2.5, alpha = 0.65)+
  stat_compare_means(comparisons = diversity_comps,
                     tip.length = 0.01,
                     symnum.args = symnum.args,
                     method.args = list(alternative = "two.sided",
                                        exact = FALSE),
                    
                     label.y = c(0.20, 0.75, 0.985, 1.05)
                     ) +
  theme_bw() +
  theme(
    panel.grid = eb(),
    axis.title.y = et(size = 14, color = "black"),
    axis.title.x = eb(),
    axis.text = et(size = 12, color = "black"),
    plot.margin = margin(t = 5,  # Top margin
                         r = 5,  # Right margin
                         b = 5,  # Bottom margin
                         l = 5), # Left margin
    panel.border = eb(),
    axis.line = el(color = 'black')
  ) +
  ylab(~atop(paste(italic("Enterococcus")), paste("MetaPhlAn4 Relative Abundance"))) +
  scale_fill_manual(values = rev(pirate_colors)) +
  scale_color_manual(values = diversity_group_colors) +
  guides(fill = guide_legend("Cohort"),
         color = guide_legend("Diversity Group",
                              override.aes = aes(label = "")))+
  scale_y_continuous(breaks = seq(0,1,0.1),
                     expand = expansion(mult = c(0.01, 0.035)),
                     labels = scales::percent_format(accuracy = 1)) +
  coord_cartesian(xlim = c(1.1,3.9))
## [1] FALSE
gg_ecoc_rel_abd

pdf(file = "./Results/Figure_1F.pdf", height = 6, width = 7)
gg_ecoc_rel_abd
invisible(dev.off())

#### Plot Enterobacterales #####
set.seed(456)
gg_ebac_rel_abd <- phylo_rel_abd %>% 
  filter(organism == "Enterobacterales") %>% 
  group_by(organism) %>% 
  mutate(db = factor(db, levels = c("Liver Transplant", "Healthy Donor"))) %>% 
  ggplot(., aes(x = diversity_group_abv, 
                y = pctseqs, 
                color = diversity_group_abv, 
                fill = db)) +
  geom_boxplot(outlier.colour = NA,
               alpha = 0.35) +
  geom_jitter(width = 0.2, size = 2.5, alpha = 0.65)+
  stat_compare_means(comparisons = diversity_comps,
                     tip.length = 0.01,
                     symnum.args = symnum.args,
                     method.args = list(alternative = "two.sided",
                                        exact = FALSE),
                     label.y = c(0.20, 0.75, 0.985, 1.05)
                     ) +
  theme_bw() +
  theme(
    panel.grid = eb(),
    axis.title.y = et(size = 14, color = "black"),
    axis.title.x = eb(),
    axis.text = et(size = 12, color = "black"),
    plot.margin = margin(t = 5,  # Top margin
                         r = 5,  # Right margin
                         b = 5,  # Bottom margin
                         l = 5), # Left margin
    panel.border = eb(),
    axis.line = el(color = 'black')
  ) +
  ylab(~atop(paste(italic("Enterobacterales")), paste("MetaPhlAn4 Relative Abundance"))) +
  scale_fill_manual(values = rev(pirate_colors)) +
  scale_color_manual(values = diversity_group_colors) +
  guides(fill = guide_legend("Cohort"),
         color = guide_legend("Diversity Group",
                              override.aes = aes(label = "")))+
  scale_y_continuous(breaks = seq(0,1,0.1),
                     expand = expansion(mult = c(0.01, 0.035)),
                     labels = scales::percent_format(accuracy = 1)) +
  coord_cartesian(xlim = c(1.1,3.9))
## [1] FALSE
gg_ebac_rel_abd

pdf(file = "./Results/Figure_1G.pdf", height = 6, width = 7)
gg_ebac_rel_abd
invisible(dev.off())

#### Plot Bacteroidetes ####
set.seed(456)
gg_bact_rel_abd <- phylo_rel_abd %>% 
  filter(organism == "Bacteroidetes") %>% 
  group_by(organism) %>% 
  mutate(db = factor(db, levels = c("Liver Transplant", "Healthy Donor"))) %>% 
  ggplot(., aes(x = diversity_group_abv, 
                y = pctseqs, 
                color = diversity_group_abv, 
                fill = db)) +
  geom_boxplot(outlier.colour = NA,
               alpha = 0.35) +
  geom_jitter(width = 0.2, size = 2.5, alpha = 0.65)+
  stat_compare_means(comparisons = diversity_comps,
                     tip.length = 0.01,
                     symnum.args = symnum.args,
                     method.args = list(alternative = "two.sided",
                                        exact = FALSE),
                     label.y = c(0.65, 0.98, 0.92, 1.02)
                     ) + 
  theme_bw() +
  theme(
    panel.grid = eb(),
    axis.title.y = et(size = 14, color = "black"),
    axis.title.x = eb(),
    axis.text = et(size = 12, color = "black"),
    plot.margin = margin(t = 5,  # Top margin
                         r = 5,  # Right margin
                         b = 5,  # Bottom margin
                         l = 5), # Left margin
    panel.border = eb(),
    axis.line = el(color = 'black')
  ) +
  ylab(~atop(paste(italic("Bacteroidetes")), paste("MetaPhlAn4 Relative Abundance"))) +
  scale_fill_manual(values = rev(pirate_colors)) +
  scale_color_manual(values = diversity_group_colors) +
  guides(fill = guide_legend("Cohort"),
         color = guide_legend("Diversity Group",
                              override.aes = aes(label = "")))+
  scale_y_continuous(breaks = seq(0,1,0.1),
                     expand = expansion(mult = c(0.01, 0.035)),
                     labels = scales::percent_format(accuracy = 1)) +
  coord_cartesian(xlim = c(1.1,3.9))
## [1] FALSE
gg_bact_rel_abd

pdf(file = "./Results/Figure_1H.pdf", height = 6, width = 7)
gg_bact_rel_abd
invisible(dev.off())

#### Plot Lachnospiraceae ####
set.seed(456)
gg_lach_rel_abd <- phylo_rel_abd %>% 
  filter(organism == "Lachnospiraceae") %>% 
  group_by(organism) %>% 
  mutate(db = factor(db, levels = c("Liver Transplant", "Healthy Donor"))) %>% 
  ggplot(., aes(x = diversity_group_abv, 
                y = pctseqs, 
                color = diversity_group_abv, 
                fill = db)) +
  geom_boxplot(outlier.colour = NA,
               alpha = 0.35) +
  geom_jitter(width = 0.2, size = 2.5, alpha = 0.65)+
  stat_compare_means(comparisons = diversity_comps,
                     tip.length = 0.01,
                     symnum.args = symnum.args,
                     method.args = list(alternative = "two.sided",
                                        exact = FALSE),
                     label.y = c(0.65, 0.71, 0.76, 0.82)
                     ) +
  theme_bw() +
  theme(
    panel.grid = eb(),
    axis.title.y = et(size = 14, color = "black"),
    axis.title.x = eb(),
    axis.text = et(size = 12, color = "black"),
    plot.margin = margin(t = 5,  # Top margin
                         r = 5,  # Right margin
                         b = 5,  # Bottom margin
                         l = 5), # Left margin
    panel.border = eb(),
    axis.line = el(color = 'black')
  ) +
  ylab(~atop(paste(italic("Lachnospiraceae")), paste("MetaPhlAn4 Relative Abundance"))) +
  scale_fill_manual(values = rev(pirate_colors)) +
  scale_color_manual(values = diversity_group_colors) +
  guides(fill = guide_legend("Cohort"),
         color = guide_legend("Diversity Group",
                              override.aes = aes(label = "")))+
  scale_y_continuous(breaks = seq(0,1,0.1),
                     expand = expansion(mult = c(0.01, 0.035)),
                     labels = scales::percent_format(accuracy = 1)) +
  coord_cartesian(xlim = c(1.1,3.9))
## [1] FALSE
gg_lach_rel_abd

pdf(file = "./Results/Figure_1I.pdf", height = 6, width = 7)
gg_lach_rel_abd
invisible(dev.off())

#### Plot Oscillospiraceae ####
set.seed(456)
gg_oscl_rel_abd <- phylo_rel_abd %>% 
  filter(organism == "Oscillospiraceae") %>% 
  group_by(organism) %>% 
  mutate(db = factor(db, levels = c("Liver Transplant", "Healthy Donor"))) %>% 
  ggplot(., aes(x = diversity_group_abv, 
                y = pctseqs, 
                color = diversity_group_abv, 
                fill = db)) +
  geom_boxplot(outlier.colour = NA,
               alpha = 0.35) +
  geom_jitter(width = 0.2, size = 2.5, alpha = 0.65)+
  stat_compare_means(comparisons = diversity_comps,
                     tip.length = 0.01,
                     symnum.args = symnum.args,
                     method.args = list(alternative = "two.sided",
                                        exact = FALSE),
                     label.y = c(0.33, 0.52, 0.59, 0.66)
                     ) +
  theme_bw() +
  theme(
    panel.grid = eb(),
    axis.title.y = et(size = 14, color = "black"),
    axis.title.x = eb(),
    axis.text = et(size = 12, color = "black"),
    plot.margin = margin(t = 5,  # Top margin
                         r = 5,  # Right margin
                         b = 5,  # Bottom margin
                         l = 5), # Left margin
    panel.border = eb(),
    axis.line = el(color = 'black')
  ) +
  ylab(~atop(paste(italic("Oscillospiraceae")), paste("MetaPhlAn4 Relative Abundance"))) +
   scale_fill_manual(values = rev(pirate_colors)) +
  scale_color_manual(values = diversity_group_colors) +
  guides(fill = guide_legend("Cohort"),
         color = guide_legend("Diversity Group",
                              override.aes = aes(label = "")))+
  scale_y_continuous(breaks = seq(0,1,0.1),
                     expand = expansion(mult = c(0.01, 0.035)),
                     labels = scales::percent_format(accuracy = 1)) +
  coord_cartesian(xlim = c(1.1,3.9))
## [1] FALSE
gg_oscl_rel_abd

pdf(file = "./Results/Figure_1J.pdf", height = 6, width = 7)
gg_oscl_rel_abd
invisible(dev.off())

#### Plot Metaphlan Relative Abundance ####
# Figure 1A-MetaPhlAn4 Taxonomy
# Load legend
tax_legend <- magick::image_read_pdf("./Data/legend.v2.pdf")

gg_tax_legend <- cowplot::ggdraw() + cowplot::draw_image(tax_legend)

metaphlan_pal2 <- getRdpPal(metaphlan_df2)

gg_metaphlan <- metaphlan_df2 %>%
  left_join(metaphlan_df_sumry %>% select(sampleID, diversity_group)) %>% 
  ungroup() %>% 
  mutate(Genus = factor(Genus, levels = unique(Genus))) %>% 
  group_by(sampleID) %>% 
  arrange(Genus) %>% 
  ggplot(aes(x=reorder(sampleID, Shannon),y=pctseqs)) +
  geom_bar(stat="identity",aes(fill=Genus), width = 0.9) +
  scale_fill_manual(values = metaphlan_pal2) +
  theme_bw() +
  theme(legend.position = "none",
        axis.text.x=eb(),
        axis.ticks.x=eb(),
        strip.text.x= et(angle=0,size=14),
        strip.background = eb(),
        axis.title.y = et(color = "black", size = 14),
        axis.text.y = et(color = "black", size = 12),
        panel.spacing = unit(0.5, "lines"),
        plot.margin = margin(t = 5,
                             r = 5, 
                             b = 0, 
                             l = 5)) +
  facet_grid(. ~diversity_group, scales = "free", space = "free")+
  scale_y_continuous(expand = expansion(mult = c(0.005,0.005)),
                     labels = scales::percent_format(accuracy = 1)) +
  ylab("MetaPhlAn4 Relative Abundance") +
  xlab("")


# Color facets
gg_metaphlan_grob <- ggplot_gtable(ggplot_build(gg_metaphlan))
strip_both <- which(grepl('strip-', gg_metaphlan_grob$layout$name))
fills <- diversity_group_colors
k <- 1
for (i in strip_both) {
  l <- which(grepl('titleGrob', gg_metaphlan_grob$grobs[[i]]$grobs[[1]]$childrenOrder))
  gg_metaphlan_grob$grobs[[i]]$grobs[[1]]$children[[l]]$children[[1]]$gp$col <- fills[k]
  k <- k+1
}

gg_shannon <- metaphlan_df_sumry %>% 
  ggplot(aes(x=reorder(sampleID, Shannon), y = Shannon)) +
  geom_bar(stat="identity",aes(fill=diversity_group_abv), width = 0.9) +
  theme_bw() +
  theme(legend.position = "none",
        axis.text.x = eb(),
        axis.title.x = eb(),
        axis.ticks.x = eb(),
        strip.text = eb(),
        strip.background = er(fill = "white"),
        axis.title.y = et(color = "black", size = 14),
        axis.text.y = et(color = "black", size = 12),
        panel.spacing = unit(0.5, "lines"),
        plot.margin = margin(t = 0,
                             r = 5, 
                             b = 0, 
                             l = 5),
        panel.grid = eb()) +
  scale_fill_manual(values = diversity_group_colors) +
  facet_grid(. ~diversity_group, scales = "free", space = "free")

gg_metaphlan_shannon <-
plot_grid(
  gg_metaphlan_grob,
  gg_shannon,
  axis = "lb",
  align = "hv",
  nrow = 2,
  rel_heights = c(1, 0.15)
)

pdf(file = "./Results/Figure_1A.pdf", width = 12.25, height = 8)
gg_metaphlan_shannon 
invisible(dev.off())

#### Combine all into a single figure 1 Start ####
alpha_org_plot <- plot_grid(
  gg_alpha_invsim + theme(legend.position = "none", 
                          axis.text.x = eb(), 
                          axis.title.x = eb(),
                          plot.margin = unit(c(0.1, 0, 0.15, 0), "cm")),
  gg_alpha_shannon + theme(legend.position = "none", 
                           axis.text.x = eb(), 
                           axis.title.x = eb(),
                           plot.margin = unit(c(0.1, 0, 0.15, 0), "cm")),
  gg_alpha_richness+ theme(legend.position = "none", 
                           axis.text.x = eb(),
                           axis.title.x = eb(),
                           plot.margin = unit(c(0.1, 0, 0.15, 0), "cm")),
  gg_lach_rel_abd + theme(axis.text.x = eb(),
                          axis.title.x = eb(),
                          plot.margin = unit(c(0.1, 0, 0.15, 0), "cm")),
  gg_ecoc_rel_abd + theme(legend.position = "none", 
                          axis.text.x = et(angle = 45, hjust = 0.85, vjust = 0.85), 
                          plot.margin = unit(c(0.05, 0, 0, 0), "cm")),
  gg_ebac_rel_abd + theme(legend.position = "none", 
                          axis.text.x = et(angle = 45, hjust = 0.85, vjust = 0.85), 
                          plot.margin = unit(c(0.05, 0, 0, 0), "cm")),
  gg_bact_rel_abd + theme(legend.position = "none", 
                          axis.text.x = et(angle = 45, hjust = 0.85, vjust = 0.85), 
                          plot.margin = unit(c(0.05, 0, 0, 0), "cm")),
  gg_oscl_rel_abd + theme(axis.text.x = et(angle = 45, hjust = 0.85, vjust = 0.85), 
                          plot.margin = unit(c(0.05, 0, 0, 0), "cm")),
  nrow = 2,
  axis = "lb",
  align = "v",
  rel_widths = c(1, 1, 1, 1.3),
  rel_heights = c(1,1.2)
)

# This is useful to figure out the general layout of the plots
gs <- lapply(1:4, function(ii)
  grobTree(rectGrob(gp=gpar(fill=ii, alpha=0.5)), textGrob(ii)))

grid.arrange(grobs=gs, ncol=4,
               top="top label", bottom="bottom\nlabel",
               left="left label", right="right label")

lay <- rbind(c(1,1,1,1,1,3,3,2),
             c(1,1,1,1,1,3,3,2),
             c(4,4,4,4,4,4,NA,NA),
             c(4,4,4,4,4,4,4,4),
             c(4,4,4,4,4,4,4,4))

grid.arrange(grobs = gs, layout_matrix = lay)

pdf(file = "./Results/Figure_1.pdf", height = 16, width = 20)

grid.arrange(
  gg_metaphlan_shannon,                        # 1
  gg_tax_legend,                               # 3
  tax_umap_mat_plot + 
    theme(
      legend.text = et(size = 10),
      legend.title = et(size = 12),
      plot.margin = margin(
        t = 5,  # Top margin
        r = 0,  # Right margin
        b = 75, # Bottom margin
        l = 5   # Left margin
      )
    ),                                      # 2
  alpha_org_plot,                           # 4
  layout_matrix = lay
)

invisible(dev.off())


grid.arrange(
  gg_metaphlan_shannon,                        # 1
  gg_tax_legend,                               # 2
  tax_umap_mat_plot + 
    theme(
      legend.text = et(size = 10),
      legend.title = et(size = 12),
      plot.margin = margin(
         t = 5,  # Top margin
        r = 0,  # Right margin
        b = 75, # Bottom margin
        l = 5   # Left margin
      )
    ),                                      # 3
  alpha_org_plot,                           # 4
  layout_matrix = lay
)

#### Combine all into a single figure 1 End ####

Figure 2: Qualitative Metabolomics Heatmap

heatmap_data <- 
  t_metaphlan %>% 
  drop_na(taxid) %>% 
  select(sampleID) %>% 
  group_by(sampleID) %>% 
  dplyr::slice(1) %>% 
  mutate(db = ifelse(grepl(sampleID, pattern = "lt"), "Liver Transplant", "Healthy Donor")) %>%
  left_join(metab_qual_anon) %>% 
  mutate(compound = ifelse(compound == "isovaleric-acid", "isovalerate", compound),
         compound = str_to_title(compound),
         compound = recode(compound,
                          Preq1 = "PreQ1")) %>% 
  filter(compound %in% heatmap_cmpds$compound|is.na(compound)) %>% 
  group_by(compound) %>% 
  mutate(median_val = ifelse(median(mvalue, na.rm = TRUE) == 0, min(mvalue[mvalue > 0], na.rm = TRUE)/10, median(mvalue, na.rm = TRUE)),
         heatmap_val = ifelse(log(mvalue/ median_val, base = 2) == -Inf, 0, log(mvalue/ median_val, base = 2))) %>% 
  ungroup() %>% 
  select(-c(mvalue, median_val)) %>% 
  group_by(sampleID, compound) %>% 
  slice_max(heatmap_val, with_ties = F, n = 1) %>% 
  ungroup() %>%
  select(-db) %>% 
  left_join(metaphlan_df2 %>%
               left_join(metaphlan_df_sumry %>% select(sampleID, db, diversity_group_abv)) %>%
               distinct(sampleID, db, diversity_group_abv), by = "sampleID") %>% 
  group_by(sampleID, compound) %>% 
  slice_max(heatmap_val, with_ties = F, n = 1) %>% 
  left_join(alpha_shannon) %>% 
  group_by(db) %>% 
  arrange(db, Shannon) %>% 
  ungroup()

# Stats
heatmap_pvals <-
  heatmap_data %>%
  filter(diversity_group_abv != "Healthy Donor") %>% 
  drop_na(compound) %>%
  group_by(compound) %>%
  rstatix::kruskal_test(heatmap_val~diversity_group_abv) %>%
  rstatix::adjust_pvalue(method = "BH") %>%
  select(compound, statistic, p, p.adj) 

heatmap_labels <- 
  heatmap_data %>% 
  mutate(compound = str_to_title(compound)) %>% 
  left_join(heatmap_lookup) %>%
  left_join(heatmap_pvals %>% group_by(compound, p.adj, p) %>% slice(1)) %>%
  arrange(class, subclass, p) %>%
  arrange(class, subclass) %>% 
  pivot_wider(c(diversity_group_abv, sampleID, db, Shannon), names_from = "compound", values_from = "heatmap_val", values_fn = mean) %>% 
  group_by(db) %>% 
  arrange(db, Shannon) %>% 
  ungroup() %>% 
  distinct(sampleID, .keep_all = TRUE) %>% 
  select(diversity_group_abv) %>% 
  mutate(
    diversity_group_abv = as.character(diversity_group_abv),
    diversity_group_abv = ifelse(
      grepl(pattern = "Healthy", as.character(diversity_group_abv)),
      diversity_group_abv,
      paste(diversity_group_abv, "Diversity")
    ),
    diversity_group_abv = factor(diversity_group_abv, levels = c("Low Diversity", "Medium Diversity",
                                                                 "High Diversity", "Healthy Donor"))
  )
  
# Build heatmap compound order (row order) and compound class (row slice)
heatmap_order <- 
  heatmap_data %>% 
  ungroup() %>% 
  filter(compound %in% heatmap_cmpds$compound) %>% 
  distinct(compound) %>% 
  drop_na() %>% 
  left_join(heatmap_lookup) %>% 
  left_join(heatmap_pvals %>% group_by(compound, p.adj, p) %>% slice(1)) %>%
  ungroup() %>% 
  mutate(class = factor(
    class,
    levels = c(
      "Fatty Acid",                   # 1
      "Dicarboxylic Acid",            # 2
      "Amino Acid",                   # 3
      "Bile Acid",                    # 4
      "Indole",                       # 5
      "Phenolic Aromatic",            # 6
      "Kynurine Pathway",             # 7
      "Vitamin"                       # 8
    )
  ),
  subclass = factor(
    subclass,
    levels = c(
      "Short-Chain Fatty Acid",       # 1
      "Branched-Chain Fatty Acid",    # 2
      "Aminated Fatty Acid",          # 3
      "Long-Chain Fatty Acid",        # 4
      "Dicarboxylic Acid",            # 5
      "Amino Acid",                   # 6
      "Primary Bile Acid",            # 7
      "Secondary Bile Acid",          # 8
      "Conjugated Bile Acid",         # 9
      "Indole",                       # 10
      "Phenolic Aromatic",            # 11
      "Kynurine Pathway",             # 12
      "Vitamin"                       # 13
    )
  )) %>% 
  arrange(class, subclass, p, compound) %>%
  select(class, subclass, p, compound)
  
# Build heatmap patient order following same order as gg_metaphlan plot
heatmap_column_order <- heatmap_data %>% 
  group_by(patientID) %>% 
  dplyr::slice(1) %>% 
  left_join(alpha_shannon) %>% 
  group_by(db) %>% 
  arrange(db, Shannon) %>%  
  select(patientID) %>% 
  distinct(patientID) %>% 
  pull(patientID)

# P-value legend color
pvalue_col_fun = colorRamp2(c(0, 0.045), c("#75C236", "#E3E4E6"))

# Create heatmap for adjusted p-values
pvalue_adj <-
heatmap_pvals %>% 
  group_by(compound, p.adj, p) %>% 
  dplyr::slice(1) %>% 
  ungroup() %>% 
  left_join(heatmap_lookup) %>%
  mutate(class = factor(
    class,
    levels = c(
      "Fatty Acid",                   # 1
      "Dicarboxylic Acid",            # 2
      "Amino Acid",                   # 3
      "Bile Acid",                    # 4
      "Indole",                       # 5
      "Phenolic Aromatic",            # 6
      "Kynurine Pathway",             # 7
      "Vitamin"                       # 8
    )
  ),
  subclass = factor(
    subclass,
    levels = c(
      "Short-Chain Fatty Acid",       # 1
      "Branched-Chain Fatty Acid",    # 2
      "Aminated Fatty Acid",          # 3
      "Long-Chain Fatty Acid",        # 4
      "Dicarboxylic Acid",            # 5
      "Amino Acid",                   # 6
      "Primary Bile Acid",            # 7
      "Secondary Bile Acid",          # 8
      "Conjugated Bile Acid",         # 9
      "Indole",                       # 10
      "Phenolic Aromatic",            # 11
      "Kynurine Pathway",             # 12
      "Vitamin"                       # 13
    )
  )) %>% 
  arrange(class, subclass, p.adj, compound) %>%
  select(class, subclass, p.adj, compound) %>% 
  column_to_rownames(var = "compound") %>% 
  select(`Adjusted p-value` = p.adj) %>% 
  as.matrix() %>% 
Heatmap(name = "Significance (adjusted p-value)",
        cluster_rows = FALSE,
        cluster_columns = FALSE,
        col = pvalue_col_fun,        
        column_title = "KW Test",
        column_title_side = "bottom",
        column_title_rot = 90,
        show_column_names = FALSE,
        column_names_side = "top",
        row_names_gp = gpar(fontsize = 8),
        rect_gp = gpar(col = "black", lwd = 0.2),
        row_gap = unit(3.5, "mm"),
        row_names_side = c("right"),
        row_order = heatmap_order$compound,
        row_split = heatmap_order$subclass,
        show_row_names = FALSE,
        row_title_rot = 0,
        heatmap_legend_param = list(at = seq(0.05, 0, -0.01)),
        width = unit(0.1, "in")
)

# Heatmap legend color
col_fun <- colorRamp2(breaks = c(-5, 0, 5), colors = c("#00aaad", "white", "#ad003a"))

# Global parameter for annotation 
# ht_opt$COLUMN_ANNO_PADDING <- unit(2.5, "mm")

gg_metab_heatmap <-
  heatmap_data %>% 
  mutate(compound = str_to_title(compound),
         compound = recode(compound,
                          Preq1 = "PreQ1")) %>%
  left_join(heatmap_lookup) %>% 
  left_join(heatmap_pvals %>% group_by(compound, p.adj, p) %>% slice(1) %>% select(compound, p.adj, p), by = "compound") %>%
  mutate(class = factor(
    class,
    levels = c(
      "Fatty Acid",                   # 1
      "Dicarboxylic Acid",            # 2
      "Amino Acid",                   # 3
      "Bile Acid",                    # 4
      "Indole",                       # 5
      "Phenolic Aromatic",            # 6
      "Kynurine Pathway",             # 7
      "Vitamin"                       # 8
    )
  ),
  subclass = factor(
    subclass,
    levels = c(
      "Short-Chain Fatty Acid",       # 1
      "Branched-Chain Fatty Acid",    # 2
      "Aminated Fatty Acid",          # 3
      "Long-Chain Fatty Acid",        # 4
      "Dicarboxylic Acid",            # 5
      "Amino Acid",                   # 6
      "Primary Bile Acid",            # 7
      "Secondary Bile Acid",          # 8
      "Conjugated Bile Acid",         # 9
      "Indole",                       # 10
      "Phenolic Aromatic",            # 11
      "Kynurine Pathway",             # 12
      "Vitamin"                       # 13
    )
  )) %>%
  mutate(
    diversity_group_abv = as.character(diversity_group_abv),
    diversity_group_abv = ifelse(
      grepl(pattern = "Healthy", as.character(diversity_group_abv)),
      diversity_group_abv,
      paste(diversity_group_abv, "Diversity")
    ),
    diversity_group_abv = factor(diversity_group_abv, levels = c("Low Diversity", "Medium Diversity",
                                                                 "High Diversity", "Healthy Donor"))
  ) %>% 
  arrange(class, subclass, p.adj, compound) %>%
  pivot_wider(c(diversity_group_abv, db, patientID, Shannon), names_from = compound, values_from = heatmap_val) %>%
  group_by(patientID) %>% 
  arrange(patientID) %>% 
  ungroup() %>% 
  select(-`NA`) %>%
  distinct(patientID, .keep_all = TRUE) %>%
  group_by(diversity_group_abv) %>% 
  arrange(db, Shannon) %>% 
  ungroup() %>% 
  select(-Shannon, -db, -diversity_group_abv) %>% 
  column_to_rownames("patientID") %>%
  as.matrix() %>% 
  t() %>%
  Heatmap(
    name = "Fold Change (log2)",
    col = col_fun,
    na_col = "grey83",
    rect_gp = gpar(col = "grey40", lwd = 1.5),
    column_names_gp = grid::gpar(fontsize = 8),
    column_gap = unit(2.5, "mm"),
    column_split = heatmap_labels,
    column_order = heatmap_column_order,
    column_title_gp = gpar(fontsize = 14),
    column_title_rot = 0,
    cluster_columns = FALSE,
    show_column_names = TRUE,
    show_column_dend = FALSE,
    row_names_gp = gpar(fontsize = 8),
    row_gap = unit(3.5, "mm"),
    row_names_side = c("left"),
    row_order = heatmap_order$compound,
    row_split = heatmap_order$subclass,
    row_title_rot = 0,
    cluster_rows = FALSE,
    show_row_dend = FALSE,
    heatmap_height = unit(13, "in"),
    heatmap_width =  unit(19, "in")
  )

gg_metab_heatmap_tot <- gg_metab_heatmap + pvalue_adj
gg_metab_heatmap_tot

pdf(file = "./Results/Figure_2.pdf", height = 15, width = 26, onefile = F)
gg_metab_heatmap_tot
invisible(dev.off())

Figure 3: Quant Metabolomics-Healthy Donor vs LT Patients

# Conversions of compounds
metab_conversions <- data.frame(
  compound = c(
    "Taurocholic Acid",
    "Glycocholic Acid",
    "Cholic Acid",
    "3-Oxolithocholic Acid",
    "Alloisolithocholic Acid",
    "Deoxycholic Acid",
    "Isodeoxycholic Acid",
    "Lithocholic Acid",
    "Kynurenic Acid",
    "Kynurenine",
    "Anthranilic Acid",
    "Desaminotyrosine",
    "Niacin",
    "Tyrosine",
    "Tryptamine",
    "Tryptophan",
    "Phenylalanine",
    "Acetate",
    "Butyrate",
    "Propionate",
    "Succinate"
  ),
  molar_mass__gmol = c(
    515.7,
    465.6,
    408.6,
    374.6,
    376.6,
    392.6,
    392.6,
    376.6,
    189.17,
    208.21,
    137.14,
    166.17,
    123.11,
    181.19,
    160.22,
    204.22,
    165.19,
    59.04,
    87.1,
    73.07,
    116.07
  )
)

metab_quant_converted <- metab_quant_anon %>%
  mutate(
    compound = ifelse(compound == "isovaleric-acid", "isovalerate", compound),
    compound = str_to_title(compound)
  ) %>%
  mutate(units = case_when(compound %in% c(
    "Taurocholic Acid",
    "Glycocholic Acid",
    "Cholic Acid",
    "3-Oxolithocholic Acid",
    "Alloisolithocholic Acid",
    "Deoxycholic Acid",
    "Isodeoxycholic Acid",
    "Lithocholic Acid"
  ) ~ "ugmL",
  compound %in% c("Acetate", "Butyrate", "Succinate", "Propionate") ~ "mM",
  TRUE ~ "uM")) %>% 
  right_join(metab_conversions, by = "compound") %>% # right_join to only keep compounds we're interested in
  mutate(mvalue__mM = case_when(units == "ugmL" ~ (mvalue*               #ugmL
                                                  ((1000/                #1000 ml per L
                                                    1000000)/            #1000000 ug per gram
                                                    molar_mass__gmol)*   #gram per mol (molar mass)
                                                    1000                 #1000 mM per 1M                                            
                                                   ),
                                units == "uM" ~ mvalue/                  #uM
                                                1000,                    #1000uM per 1M
                                TRUE ~ mvalue #units are already in mM
                                )
         ) %>% 
  select(sampleID, compound, mvalue__mM)
  

metab_boxplot <- 
  metaphlan_df2 %>% 
  left_join(metaphlan_df_sumry) %>% 
  drop_na(taxid) %>% 
  select(sampleID, diversity_group_abv, db) %>% 
  group_by(sampleID) %>% 
  dplyr::slice(1) %>% 
  left_join(metab_quant_converted) %>% 
  mutate(compound = ifelse(compound == "isovaleric-acid", "isovalerate", compound),
         compound = str_to_title(compound)) %>% 
  ungroup() %>%
  mutate(class = case_when(compound %in% c("Taurocholic Acid", "Glycocholic Acid") ~ "Conjugated Primary Bile Acid",
                           compound %in% c("Cholic Acid") ~ "Primary Bile Acid",
                           compound %in% c("3-Oxolithocholic Acid", "Alloisolithocholic Acid", "Deoxycholic Acid", "Isodeoxycholic Acid",
                                                "Lithocholic Acid") ~ "Secondary Bile Acid",
                           compound %in% c("Threonine", "Glycine", "Tyrosine", "Tyramine", "Serine", "Leucine", "Isoleucine",
                                               "Valine", "Phenylalanine", "Alanine", "Proline", "Aspartate", 
                                               "Methionine", "Glutamate", "Lysine", "Cysteine", "Tryptophan") ~ "Amino Acid",
                           compound %in% c("Acetate", "Butyrate", "Succinate", "Propionate") ~ "Short-Chain Fatty Acid",
                           compound %in% c("Kynurenic Acid", "Anthranilic Acid", "Kynurenine", "Tryptamine") ~ "Kynurenine Metabolite",
                           compound == "Desaminotyrosine" ~ "Phenolic Aromatic",
                           compound == "Niacin" ~ "B-Vitamin",
                              TRUE ~ "Indole"),
         compound = case_when(class == "Conjugated Primary Bile Acid" ~ paste(compound, "(1˚Conj. BA)"),
                              class == "Primary Bile Acid" ~ paste(compound, "(1˚ BA)"),
                              class == "Secondary Bile Acid" ~ paste(compound, "(2˚ BA)"),
                              class == "Short-Chain Fatty Acid" ~ paste(compound, "(SCFA)"),
                              class == "Amino Acid" ~ paste(compound, "(AA)"),
                              class == "Phenolic Aromatic" ~ paste(compound, "(Phen. Arom.)"),
                              class == "Indole" ~ paste0(compound, "(Indole)"),
                              class == "Kynurenine Metabolite" ~ paste(compound, "(Kyn. Metab.)"),
                              class == "B-Vitamin" ~ paste(compound, "(B-Vitamin)")
                              )) %>% 
  drop_na() %>% 
  group_by(compound) %>% 
  mutate(count = length(unique(diversity_group_abv))) %>% 
  filter(count == 4) %>% 
  select(-count) %>%
  mutate(compound = factor(
    compound,
    levels = c(
      "Acetate (SCFA)",                  # 1
      "Propionate (SCFA)",               # 2
      "Butyrate (SCFA)",                 # 3
      "Succinate (SCFA)",                # 4
      "Tyrosine (AA)",                   # 5
      "Tryptophan (AA)",                 # 6
      "Phenylalanine (AA)",              # 7
      "Cholic Acid (1˚ BA)",             # 8
      "Glycocholic Acid (1˚Conj. BA)",   # 9
      "Taurocholic Acid (1˚Conj. BA)",   # 10
      "Deoxycholic Acid (2˚ BA)",        # 11
      "Lithocholic Acid (2˚ BA)",        # 12
      "Isodeoxycholic Acid (2˚ BA)",     # 13
      "3-Oxolithocholic Acid (2˚ BA)",   # 14
      "Alloisolithocholic Acid (2˚ BA)", # 15
      "Desaminotyrosine (Phen. Arom.)",  # 16
      "Kynurenine (Kyn. Metab.)",        # 17
      "Anthranilic Acid (Kyn. Metab.)",  # 18
      "Tryptamine (Kyn. Metab.)",        # 19
      "Niacin (B-Vitamin)"               # 20
    )
  ),
  class = factor(
    class,
    levels = c(
      "Short-Chain Fatty Acid",          # 1
      "Amino Acid",                      # 2
      "Primary Bile Acid",               # 3
      "Conjugated Primary Bile Acid",    # 4
      "Secondary Bile Acid",             # 5
      "Indole",                          # 6
      "Phenolic Aromatic",               # 7
      "Kynurenine Metabolite",           # 8
      "B-Vitamin"                        # 9
    )
  )) %>% 
  filter(compound != "Kynurenic Acid") %>% 
  mutate(db = factor(db, levels = c("Liver Transplant", "Healthy Donor")),
         diversity_group_abv = factor(diversity_group_abv, levels = c("Low", "Medium", "High", "Healthy Donor"))) %>% 
  filter(compound %in% c(
    "Acetate (SCFA)",
    "Butyrate (SCFA)",
    "Propionate (SCFA)",
    "Glycocholic Acid (1˚Conj. BA)",
    "Taurocholic Acid (1˚Conj. BA)",
    "Cholic Acid (1˚ BA)",
    "Deoxycholic Acid (2˚ BA)",
    "Lithocholic Acid (2˚ BA)",
    "Alloisolithocholic Acid (2˚ BA)"
    ))


metab_boxplot_stats <-
  metab_boxplot %>% 
  group_by(class, compound) %>% 
  rstatix::wilcox_test(mvalue__mM~diversity_group_abv,
                       comparisons = diversity_comps,
                       p.adjust.method = "none",
                       alternative= "two.sided") %>% 
  rstatix::adjust_pvalue(method = "BH") %>%
  rstatix::add_significance("p.adj") %>% 
  mutate(p.adj = ifelse(p.adj < 0.001, "p.adj < 0.001", paste("p.adj = ", round(p.adj, 2)))) %>% 
  mutate(y.position = case_when(group1 == "High" & group2 == "Healthy Donor" ~ 2.30,
                                group1 == "Medium" & group2 == "High" ~ 2.7,
                                group1 == "Low" & group2 == "High" ~ 3.1,
                                group1 == "Low" & group2 == "Medium" ~ 3.5)) # Set stats brackets to fixed points since the y-scale will be log10 transformed


set.seed(123) # for consistent jittering of points
gg_metab_boxplot <-
ggboxplot(metab_boxplot,
          x = "diversity_group_abv",
          y = "mvalue__mM",
          fill = "db",
          color = "diversity_group_abv",
          alpha = 0.65,
          outlier.shape = NA,
          facet.by = c("class", "compound")) +
    theme(legend.text = et(size = 12, color = "black"),
          legend.title = et(size = 14, color = "black"),
          axis.title.x = eb(),
          axis.title.y = et(size = 12, color = "black"),
          panel.border = eb(),
          strip.background = er(colour="white", fill="white"),
          ) +
    geom_hline(yintercept = 0) + 
    geom_segment(aes(x = 0.35, y = 0, xend = 0.35, yend = Inf)) +
    facet_wrap(~compound, scales = "fixed") +
    stat_pvalue_manual(metab_boxplot_stats, 
                       tip.length = 0.015) +
  geom_point(
    data = metab_boxplot,
    aes(x = diversity_group_abv, y = mvalue__mM, color = diversity_group_abv),
    position = position_jitter(width = 0.2),
    size = 2,
    alpha = 0.65) +
    scale_fill_manual("Cohort", values = rev(pirate_colors)) +
    scale_color_manual("Diversity Group", values = diversity_group_colors) +
    scale_y_log10(limits = c(0.01, 5000),
                  labels = c(0.01, 0.1, 1, 10, 100, 1000),
                  breaks = c(0.01, 0.1, 1, 10, 100, 1000),
                  expand = expansion(mult = c(0.1, 0.2))) +
    ylab("Concentration (mM)\n")

gg_metab_boxplot

cairo_pdf(filename = "./Results/Figure_3.pdf", width = 14, height = 10, onefile = FALSE)

gg_metab_boxplot

invisible(dev.off())

Figure 4A: Enterococcus Expansion and Infection

gg_ecoc_all_patients <- peri_matrix_all %>%
    distinct(sampleID, .keep_all = T) %>%
    select(sampleID, patientID, bact_infection_present) %>%
    mutate(bact_infection_present = ifelse(bact_infection_present ==
        "No", "No Infection", "Infection"), bact_infection_present = factor(bact_infection_present,
        levels = c("Infection", "No Infection"))) %>%
    left_join(metaphlan_peri_anon %>%
        select(-bact_infection_present) %>%
        group_by(patientID, sampleID) %>%
        filter(grepl(x = Species, pattern = "Enterococcus", ignore.case = T)) %>%
        count(sampleID, wt = pctseqs, name = "enterococcus_rel_abundance")) %>%
    left_join(metaphlan_peri_anon %>%
        select(patientID, sampleID, Kingdom:pctseqs)) %>%
    ungroup() %>%
    arrange(Kingdom, Phylum, Class, Order, Family, Genus) %>%
    mutate(Genus = paste0(Phylum, "-", Order, "-", Family, "-",
        Genus)) %>%
    group_by(sampleID) %>%
    arrange(Genus) %>%
    mutate(cum.pct = cumsum(pctseqs), y.text = (cum.pct + c(0,
        cum.pct[-length(cum.pct)]))/2) %>%
    ungroup() %>%
    dplyr::select(-cum.pct) %>%
    mutate(sampleID = ifelse(is.na(sampleID), patientID, sampleID),
        Genus = factor(Genus, levels = unique(Genus)), Genus = fct_relevel(Genus,
            "Firmicutes-Lactobacillales-Enterococcaceae-Enterococcus",
            after = Inf)) %>%
    arrange(-enterococcus_rel_abundance) %>%
    group_by(sampleID) %>%
    ggplot(aes(x = reorder(sampleID, -enterococcus_rel_abundance),
        y = pctseqs)) + geom_bar(aes(fill = Genus), stat = "identity") +
    theme_bw() + theme(legend.position = "none", axis.text.x = et(angle = 90,
    hjust = 0.5), strip.text.x = et(angle = 0, size = 14), strip.background = eb(),
    axis.title.y = et(color = "black", size = 12), axis.text.y = et(color = "black",
        size = 10)) + scale_fill_manual(values = metaphlan_pal2) +
    scale_y_continuous(expand = expansion(mult = c(0.005, 0.005))) +
    ylab("Shotgun Relative Abundance (%)\n") + xlab("") + facet_grid(~bact_infection_present,
    space = "free_x", scales = "free_x")
# gg_ecoc_all_patients


# Discrete heatmap of infections
ecoc_infx_orgs <- peri_criteria_all %>%
    filter(sampleID %in% peri_matrix_all$sampleID) %>%
    group_by(patientID, eday) %>%
    arrange(-infx_stool) %>%
    dplyr::slice(1) %>%
    ungroup() %>%
    select(patientID, sampleID, bact_infection_present, infx_stool,
        organism1, micro1.factor) %>%
    distinct() %>%
    mutate(organism1 = gsub(x = organism1, pattern = "\\s+",
        replacement = ""), organism1 = str_to_lower(string = organism1),
        organism1 = ifelse(grepl(x = organism1, pattern = "enterococcus|enterobacterales|klebsiella|escherichia|citrobacter|proteus|staphyl|clostrid|pseudo|steno|bacteroides|helico"),
            organism1, "Culture Negative")) %>%
    group_by(patientID, sampleID, infx_stool) %>%
    mutate(`Enterococcus faecium` = grepl(x = organism1, pattern = "enterococcusfaecium",
        ignore.case = T), `Enterococcus faecalis` = grepl(x = organism1,
        pattern = "enterococcusfaecalis", ignore.case = T), `Enterococcus avium` = grepl(x = organism1,
        pattern = "enterococcusavium", ignore.case = T), `Enterococcus gallinarum` = grepl(x = organism1,
        pattern = "enterococcusgallinarum", ignore.case = T),
        `Klebsiella pneumoniae` = grepl(x = organism1, pattern = "klebsiellapneumoniae",
            ignore.case = T), `Enterobacter cloaceae` = grepl(x = organism1,
            pattern = "enterobactercloaceae", ignore.case = T),
        `Escherichia coli` = grepl(x = organism1, pattern = "escherichiacoli",
            ignore.case = T), `Citrobacter freundii` = grepl(x = organism1,
            pattern = "citrobacterfreundii", ignore.case = T),
        `Proteus mirabilis` = grepl(x = organism1, pattern = "proteusmirabilis",
            ignore.case = T), `Staphylococcus aureus` = grepl(x = organism1,
            pattern = "staphylococcusaureus", ignore.case = T),
        `Staphylococcus epidermis` = grepl(x = organism1, pattern = "staphylococcusepidermidis",
            ignore.case = T), `Pseudomonas aeruginosa` = grepl(x = organism1,
            pattern = "pseudomonasaeruginosa", ignore.case = T),
        `Stenotrophmonas maltophilia` = grepl(x = organism1,
            pattern = "stenotrophmonasmaltophilia", ignore.case = T),
        `Helicobacter pylori` = grepl(x = organism1, pattern = "helicobacterpylori",
            ignore.case = T), `Clostridium difficile` = grepl(x = organism1,
            pattern = "clostridiumdifficile|clostridioidesdifficile",
            ignore.case = T), `Bacteroides sp.` = grepl(x = organism1,
            pattern = "bacteroides", ignore.case = T), `Culture Negative` = grepl(x = organism1,
            pattern = "Culture Negative", ignore.case = T)) %>%
    pivot_longer(-c(patientID:micro1.factor), names_to = "organisms",
        values_to = "org_presence") %>%
    mutate(organisms = ifelse(bact_infection_present == "No",
        "No Bacterial Infection", organisms), org_presence = ifelse(org_presence ==
        TRUE, 1, 0)) %>%
    group_by(sampleID, infx_stool, bact_infection_present, organisms) %>%
    dplyr::slice_max(org_presence) %>%
    ungroup() %>%
    filter(org_presence == 1) %>%
    left_join(metaphlan_peri_anon %>%
        select(-bact_infection_present) %>%
        group_by(patientID, sampleID) %>%
        filter(grepl(x = Species, pattern = "Enterococcus", ignore.case = T)) %>%
        count(sampleID, wt = pctseqs, name = "enterococcus_rel_abundance")) %>%
    left_join(metaphlan_peri_anon %>%
        select(patientID, sampleID, Kingdom:pctseqs)) %>%
    group_by(patientID, sampleID, organisms, org_presence) %>%
    dplyr::slice(1) %>%
    mutate(sampleID = ifelse(is.na(sampleID), patientID, sampleID)) %>%
    mutate(org_presence = ifelse(grepl(pattern = "No", x = bact_infection_present,
        ignore.case = T), 0, org_presence)) %>%
    ungroup() %>%
    mutate(organisms = ifelse(bact_infection_present == "Yes" &
        org_presence == 0, "Other", organisms)) %>%
    arrange(-enterococcus_rel_abundance) %>%
    mutate(organisms = ifelse(grepl(x = organisms, pattern = "enterococcus|klebsiella|escherichia|proteus|citrobacter|culture",
        ignore.case = TRUE), organisms, "Other Bacterial Infection"),
        organisms = ifelse(bact_infection_present == "No", "No Bacterial Infection",
            organisms))

ecoc_infx_orgs_order <- ecoc_infx_orgs %>%
    group_by(sampleID, patientID, organisms) %>%
    distinct(sampleID, patientID, .keep_all = T) %>%
    ungroup() %>%
    mutate(organisms = ifelse(grepl(organisms, pattern = "gallinarum"),
        "Other Bacterial Infection", organisms), org_colors = case_when(grepl(x = organisms,
        pattern = "enterococcus faecium", ignore.case = T) ~
        1, grepl(x = organisms, pattern = "enterococcus faecalis",
        ignore.case = T) ~ 2, grepl(x = organisms, pattern = "enterococcus avium",
        ignore.case = T) ~ 3, grepl(x = organisms, pattern = "klebsiella pneumoniae",
        ignore.case = T) ~ 4, grepl(x = organisms, pattern = "escherichia coli",
        ignore.case = T) ~ 5, grepl(x = organisms, pattern = "proteus mirabilis",
        ignore.case = T) ~ 6, grepl(x = organisms, pattern = "citrobacter freundii",
        ignore.case = T) ~ 7, grepl(x = organisms, pattern = "other bacterial infection|gallinarum",
        ignore.case = T) ~ 8, grepl(x = organisms, pattern = "culture negative",
        ignore.case = T) ~ 9, TRUE ~ 0), organisms = as.factor(organisms),
        organisms = factor(organisms, levels = c("Enterococcus faecium",
            "Enterococcus faecalis", "Enterococcus avium", "Klebsiella pneumoniae",
            "Escherichia coli", "Proteus mirabilis", "Citrobacter freundii",
            "Other Bacterial Infection", "Culture Negative",
            "No Bacterial Infection")), org_colors = factor(org_colors,
            levels = c("1", "2", "3", "4", "5", "6", "7", "8",
                "9", "0"))) %>%
    left_join(ecoc_infx_orgs) %>%
    mutate(organisms = factor(organisms, levels = c("Enterococcus faecium",
        "Enterococcus faecalis", "Enterococcus avium", "Klebsiella pneumoniae",
        "Escherichia coli", "Proteus mirabilis", "Citrobacter freundii",
        "Other Bacterial Infection", "Culture Negative", "No Bacterial Infection")),
        org_colors = factor(org_colors, levels = c("1", "2",
            "3", "4", "5", "6", "7", "8", "9", "0"))) %>%
    drop_na(organisms) %>%
    mutate(bact_infection_present = ifelse(bact_infection_present ==
        "No", "No Infection", "Infection"), bact_infection_present = factor(bact_infection_present,
        levels = c("Infection", "No Infection")))

gg_ecoc_infx_orgs <- ecoc_infx_orgs_order %>%
    droplevels() %>%
    ungroup() %>%
    ggplot(., aes(x = reorder(sampleID, -enterococcus_rel_abundance),
        y = organisms, fill = as.factor(org_colors))) + geom_tile(color = "black") +
    theme_bw() + theme(axis.title.y = et(color = "black", size = 12),
    axis.text.y = et(color = "black", size = 10), axis.ticks.x = eb(),
    axis.text.x = eb(), axis.title.x = eb(), panel.grid = eb(),
    strip.text = eb()) + scale_fill_manual(values = c("#129246",
    "#0C7A3A", "#08592B", "#FF0000", "#CC0404", "#8A0202", "#5C0202",
    "#E6C66E", "#BD992D", "#00000000"), labels = c("Enterococcus faecium",
    "Enterococcus faecalis", "Enterococcus avium", "Klebsiella pneumoniae",
    "Escherichia coli", "Citrobacter freundii", "Proteus mirabilis",
    "Other Bacterial Infection", "Culture Negative", "No Bacterial Infection")) +
    labs(fill = "Infecting Organism", y = "Infecting Organism\n") +
    scale_y_discrete(expand = expansion(mult = c(0.005, 0.005)),
        limits = rev(levels(ecoc_infx_orgs_order$organisms))) +
    facet_grid(. ~ bact_infection_present, space = "free_x",
        scales = "free_x")

# gg_ecoc_infx_orgs

yingtools2::gg.stack(gg_ecoc_all_patients, gg_ecoc_infx_orgs,
    heights = c(1, 0.5), adjust.themes = T)

pdf(file = "./Results/Figure_4A.pdf", height = 8, width = 15,
    onefile = F)

yingtools2::gg.stack(gg_ecoc_all_patients, gg_ecoc_infx_orgs,
    heights = c(1, 0.5), adjust.themes = T)

invisible(dev.off())

Figure 4B: Enterobacterales Expansion and Infection

gg_ebac_all_patients <- peri_matrix_all %>%
    distinct(sampleID, .keep_all = T) %>%
    select(sampleID, patientID, bact_infection_present) %>%
    mutate(bact_infection_present = ifelse(bact_infection_present ==
        "No", "No Infection", "Infection"), bact_infection_present = factor(bact_infection_present,
        levels = c("Infection", "No Infection"))) %>%
    left_join(metaphlan_peri_anon %>%
        select(-bact_infection_present) %>%
        group_by(patientID, sampleID) %>%
        filter(grepl(x = Species, pattern = "Klebsiella pneumoniae|Escherichia coli|Citrobacter freundii|Proteus mirabilis")) %>%
        count(sampleID, wt = pctseqs, name = "enterobacterales_rel_abundance")) %>%
    left_join(metaphlan_peri_anon %>%
        select(patientID, sampleID, Kingdom:pctseqs)) %>%
    ungroup() %>%
    arrange(Kingdom, Phylum, Class, Order, Family, Genus) %>%
    mutate(Genus = paste0(Phylum, "-", Order, "-", Family, "-",
        Genus)) %>%
    group_by(sampleID) %>%
    arrange(Genus) %>%
    mutate(cum.pct = cumsum(pctseqs), y.text = (cum.pct + c(0,
        cum.pct[-length(cum.pct)]))/2) %>%
    ungroup() %>%
    dplyr::select(-cum.pct) %>%
    mutate(sampleID = ifelse(is.na(sampleID), patientID, sampleID),
        Genus = factor(Genus, levels = unique(Genus)), Genus = fct_relevel(Genus,
            c("Proteobacteria-Enterobacterales-Enterobacteriaceae-Citrobacter",
                "Proteobacteria-Enterobacterales-Enterobacteriaceae-Enterobacter",
                "Proteobacteria-Enterobacterales-Enterobacteriaceae-Escherichia",
                "Proteobacteria-Enterobacterales-Enterobacteriaceae-Klebsiella",
                "Proteobacteria-Enterobacterales-Morganellaceae-Proteus"),
            after = Inf)) %>%
    arrange(-enterobacterales_rel_abundance) %>%
    ggplot(aes(x = reorder(sampleID, -enterobacterales_rel_abundance),
        y = pctseqs)) + geom_bar(aes(fill = Genus), stat = "identity") +
    theme_bw() + theme(legend.position = "none", axis.text.x = et(angle = 90,
    hjust = 0.5), strip.text.x = et(angle = 0, size = 14), strip.background = eb(),
    axis.title.y = et(color = "black", size = 12), axis.text.y = et(color = "black",
        size = 10)) + scale_fill_manual(values = metaphlan_pal2) +
    scale_y_continuous(expand = expansion(mult = c(0.005, 0.005))) +
    ylab("Shotgun Relative Abundance (%)\n") + xlab("") + facet_grid(~bact_infection_present,
    space = "free_x", scales = "free_x")
# gg_ebac_all_patients

# Discrete heatmap of infections
ebac_infx_orgs <- peri_criteria_all %>%
    filter(sampleID %in% peri_matrix_all$sampleID) %>%
    group_by(patientID, eday) %>%
    arrange(-infx_stool) %>%
    dplyr::slice(1) %>%
    ungroup() %>%
    select(patientID, sampleID, bact_infection_present, infx_stool,
        organism1, micro1.factor) %>%
    distinct() %>%
    mutate(organism1 = gsub(x = organism1, pattern = "\\s+",
        replacement = ""), organism1 = str_to_lower(string = organism1),
        organism1 = ifelse(grepl(x = organism1, pattern = "enterococcus|enterobacterales|klebsiella|escherichia|citrobacter|proteus|staphyl|clostrid|pseudo|steno|bacteroides|helico"),
            organism1, "Culture Negative")) %>%
    group_by(patientID, sampleID, infx_stool) %>%
    mutate(`Enterococcus faecium` = grepl(x = organism1, pattern = "enterococcusfaecium",
        ignore.case = T), `Enterococcus faecalis` = grepl(x = organism1,
        pattern = "enterococcusfaecalis", ignore.case = T), `Enterococcus avium` = grepl(x = organism1,
        pattern = "enterococcusavium", ignore.case = T), `Enterococcus gallinarum` = grepl(x = organism1,
        pattern = "enterococcusgallinarum", ignore.case = T),
        `Klebsiella pneumoniae` = grepl(x = organism1, pattern = "klebsiellapneumoniae",
            ignore.case = T), `Enterobacter cloaceae` = grepl(x = organism1,
            pattern = "enterobactercloaceae", ignore.case = T),
        `Escherichia coli` = grepl(x = organism1, pattern = "escherichiacoli",
            ignore.case = T), `Citrobacter freundii` = grepl(x = organism1,
            pattern = "citrobacterfreundii", ignore.case = T),
        `Proteus mirabilis` = grepl(x = organism1, pattern = "proteusmirabilis",
            ignore.case = T), `Staphylococcus aureus` = grepl(x = organism1,
            pattern = "staphylococcusaureus", ignore.case = T),
        `Staphylococcus epidermis` = grepl(x = organism1, pattern = "staphylococcusepidermidis",
            ignore.case = T), `Pseudomonas aeruginosa` = grepl(x = organism1,
            pattern = "pseudomonasaeruginosa", ignore.case = T),
        `Stenotrophmonas maltophilia` = grepl(x = organism1,
            pattern = "stenotrophmonasmaltophilia", ignore.case = T),
        `Helicobacter pylori` = grepl(x = organism1, pattern = "helicobacterpylori",
            ignore.case = T), `Clostridium difficile` = grepl(x = organism1,
            pattern = "clostridiumdifficile|clostridioidesdifficile",
            ignore.case = T), `Bacteroides sp.` = grepl(x = organism1,
            pattern = "bacteroides", ignore.case = T), `Culture Negative` = grepl(x = organism1,
            pattern = "Culture Negative", ignore.case = T)) %>%
    pivot_longer(-c(patientID:micro1.factor), names_to = "organisms",
        values_to = "org_presence") %>%
    mutate(organisms = ifelse(bact_infection_present == "No",
        "No Bacterial Infection", organisms), org_presence = ifelse(org_presence ==
        TRUE, 1, 0)) %>%
    group_by(sampleID, infx_stool, bact_infection_present, organisms) %>%
    dplyr::slice_max(org_presence) %>%
    ungroup() %>%
    filter(org_presence == 1) %>%
    left_join(metaphlan_peri_anon %>%
        select(-bact_infection_present) %>%
        group_by(patientID, sampleID) %>%
        filter(grepl(x = Species, pattern = "Klebsiella pneumoniae|Escherichia coli|Citrobacter freundii|Proteus mirabilis")) %>%
        count(sampleID, wt = pctseqs, name = "enterobacterales_rel_abundance")) %>%
    left_join(metaphlan_peri_anon %>%
        select(patientID, sampleID, Kingdom:pctseqs)) %>%
    group_by(patientID, sampleID, organisms, org_presence) %>%
    dplyr::slice(1) %>%
    mutate(sampleID = ifelse(is.na(sampleID), patientID, sampleID)) %>%
    mutate(org_presence = ifelse(grepl(pattern = "No", x = bact_infection_present,
        ignore.case = T), 0, org_presence)) %>%
    ungroup() %>%
    mutate(organisms = ifelse(bact_infection_present == "Yes" &
        org_presence == 0, "Other", organisms)) %>%
    arrange(-enterobacterales_rel_abundance) %>%
    mutate(organisms = ifelse(grepl(x = organisms, pattern = "enterococcus|klebsiella|escherichia|proteus|citrobacter|culture",
        ignore.case = TRUE), organisms, "Other Bacterial Infection"),
        organisms = ifelse(bact_infection_present == "No", "No Bacterial Infection",
            organisms))

ebac_infx_orgs_order <- ebac_infx_orgs %>%
    group_by(sampleID, patientID, organisms) %>%
    distinct(sampleID, patientID, .keep_all = T) %>%
    ungroup() %>%
    mutate(organisms = ifelse(grepl(organisms, pattern = "gallinarum"),
        "Other Bacterial Infection", organisms), org_colors = case_when(grepl(x = organisms,
        pattern = "enterococcus faecium", ignore.case = T) ~
        1, grepl(x = organisms, pattern = "enterococcus faecalis",
        ignore.case = T) ~ 2, grepl(x = organisms, pattern = "enterococcus avium",
        ignore.case = T) ~ 3, grepl(x = organisms, pattern = "klebsiella pneumoniae",
        ignore.case = T) ~ 4, grepl(x = organisms, pattern = "escherichia coli",
        ignore.case = T) ~ 5, grepl(x = organisms, pattern = "proteus mirabilis",
        ignore.case = T) ~ 6, grepl(x = organisms, pattern = "citrobacter freundii",
        ignore.case = T) ~ 7, grepl(x = organisms, pattern = "other bacterial infection|gallinarum",
        ignore.case = T) ~ 8, grepl(x = organisms, pattern = "culture negative",
        ignore.case = T) ~ 9, TRUE ~ 0), organisms = as.factor(organisms),
        organisms = factor(organisms, levels = c("Klebsiella pneumoniae",
            "Escherichia coli", "Proteus mirabilis", "Citrobacter freundii",
            "Enterococcus faecium", "Enterococcus faecalis",
            "Enterococcus avium", "Other Bacterial Infection",
            "Culture Negative", "No Bacterial Infection")), org_colors = factor(org_colors,
            levels = c("4", "5", "6", "7", "1", "2", "3", "8",
                "9", "0"))) %>%
    left_join(ebac_infx_orgs) %>%
    mutate(organisms = factor(organisms, levels = c("Klebsiella pneumoniae",
        "Escherichia coli", "Proteus mirabilis", "Citrobacter freundii",
        "Enterococcus faecium", "Enterococcus faecalis", "Enterococcus avium",
        "Other Bacterial Infection", "Culture Negative", "No Bacterial Infection")),
        org_colors = factor(org_colors, levels = c("4", "5",
            "6", "7", "1", "2", "3", "8", "9", "0"))) %>%
    drop_na(organisms) %>%
    mutate(bact_infection_present = ifelse(bact_infection_present ==
        "No", "No Infection", "Infection"), bact_infection_present = factor(bact_infection_present,
        levels = c("Infection", "No Infection")))

gg_ebac_infx_orgs <- ebac_infx_orgs_order %>%
    ggplot(., aes(x = reorder(sampleID, -enterobacterales_rel_abundance),
        y = organisms, fill = as.factor(org_colors))) + geom_tile(color = "black") +
    theme_bw() + theme(axis.title.y = et(color = "black", size = 12),
    axis.text.y = et(color = "black", size = 10), axis.ticks.x = eb(),
    axis.text.x = eb(), axis.title.x = eb(), panel.grid = eb(),
    strip.text = eb()) + scale_fill_manual(values = c("#FF0000",
    "#CC0404", "#8A0202", "#5C0202", "#129246", "#0C7A3A", "#08592B",
    "#E6C66E", "#BD992D", "#00000000"), labels = c("Klebsiella pneumoniae",
    "Escherichia coli", "Citrobacter freundii", "Proteus mirabilis",
    "Enterococcus faecium", "Enterococcus faecalis", "Enterococcus avium",
    "Other Bacterial Infection", "Culture Negative", "No Bacterial Infection")) +
    labs(fill = "Infecting Organism", y = "Infecting Organism\n") +
    scale_y_discrete(expand = expansion(mult = c(0.005, 0.005)),
        limits = rev(levels(ebac_infx_orgs_order$organisms))) +
    facet_grid(. ~ bact_infection_present, space = "free_x",
        scales = "free_x")

# gg_ebac_infx_orgs

yingtools2::gg.stack(gg_ebac_all_patients, gg_ebac_infx_orgs,
    heights = c(1, 0.5), adjust.themes = T)

pdf(file = "./Results/Figure_4B.pdf", height = 8, width = 15,
    onefile = F)

yingtools2::gg.stack(gg_ebac_all_patients, gg_ebac_infx_orgs,
    heights = c(1, 0.5), adjust.themes = T)

invisible(dev.off())

Figure 5B: Quant Metabolomics-Enterococcus Expansion

# Qual compounds that we can quantify and that are also
# significant in the volcano analysis for both
signif_compounds <- qual_tot_ecoc_expan %>%
    rownames_to_column(var = "compound") %>%
    filter(p.adj <= 0.05 & abs(log2fc_val) > 0.75) %>%
    distinct(compound)

metab_boxplot_ecoc_expan <- peri_matrix_all %>%
    select(sampleID, enterococcus_rel_abundance) %>%
    left_join(sample_lookup) %>%
    group_by(sampleID) %>%
    slice(1) %>%
    left_join(metab_quant_converted %>%
        filter(compound %in% signif_compounds$compound)) %>%
    filter(compound %!in% c("Kynurenine", "Anthranilic Acid")) %>%
    mutate(compound = ifelse(compound == "isovaleric-acid", "isovalerate",
        compound), compound = str_to_title(compound)) %>%
    ungroup() %>%
    mutate(db = ifelse(db == "HealthyDonor", "Healthy Donor",
        "Liver Transplant") %>%
        factor(., levels = c("Healthy Donor", "Liver Transplant")),
        class = case_when(compound %in% c("Taurocholic Acid",
            "Glycocholic Acid") ~ "Conjugated Primary Bile Acid",
            compound %in% c("Cholic Acid") ~ "Primary Bile Acid",
            compound %in% c("3-Oxolithocholic Acid", "Alloisolithocholic Acid",
                "Deoxycholic Acid", "Isodeoxycholic Acid", "Lithocholic Acid") ~
                "Secondary Bile Acid", compound %in% c("Threonine",
                "Glycine", "Tyrosine", "Tyramine", "Serine",
                "Leucine", "Isoleucine", "Valine", "Phenylalanine",
                "Alanine", "Proline", "Aspartate", "Methionine",
                "Glutamate", "Lysine", "Cysteine", "Tryptophan") ~
                "Amino Acid", compound %in% c("Acetate", "Butyrate",
                "Succinate", "Propionate") ~ "Short-Chain Fatty Acid",
            compound %in% c("Kynurenic Acid", "Anthranilic Acid",
                "Kynurenine", "Tryptamine") ~ "Kynurenine Metabolite",
            compound == "Desaminotyrosine" ~ "Phenolic Aromatic",
            compound == "Niacin" ~ "B-Vitamin", TRUE ~ "Indole")) %>%
    drop_na() %>%
    group_by(compound) %>%
    mutate(class = factor(class, levels = c("Conjugated Primary Bile Acid",
        "Primary Bile Acid", "Secondary Bile Acid", "Short-Chain Fatty Acid",
        "Amino Acid", "Phenolic Aromatic", "Indole", "Kynurenine Metabolite",
        "B-Vitamin")), compound = case_when(class == "Conjugated Primary Bile Acid" ~
        paste(compound, "(1˚Conj. BA)"), class == "Primary Bile Acid" ~
        paste(compound, "(1˚ BA)"), class == "Secondary Bile Acid" ~
        paste(compound, "(2˚ BA)"), class == "Short-Chain Fatty Acid" ~
        paste(compound, "(SCFA)"), class == "Amino Acid" ~ paste(compound,
        "(AA)"), class == "Phenolic Aromatic" ~ paste(compound,
        "(Phen. Arom.)"), class == "Indole" ~ paste0(compound,
        "(Indole)"), class == "Kynurenine Metabolite" ~ paste(compound,
        "(Kyn. Metab.)"), class == "B-Vitamin" ~ paste(compound,
        "(B-Vitamin)")), compound = factor(compound, levels = c("Acetate (SCFA)",
        "Butyrate (SCFA)", "Propionate (SCFA)", "Succinate (SCFA)",
        "Taurocholic Acid (1˚Conj. BA)", "Glycocholic Acid (1˚Conj. BA)",
        "Cholic Acid (1˚ BA)", "3-Oxolithocholic Acid (2˚ BA)",
        "Alloisolithocholic Acid (2˚ BA)", "Deoxycholic Acid (2˚ BA)",
        "Isodeoxycholic Acid (2˚ BA)", "Lithocholic Acid (2˚ BA)",
        "Kynurenic Acid (Kyn. Metab.)", "Kynurenine (Kyn. Metab.)",
        "Anthranilic Acid (Kyn. Metab.)", "Desaminotyrosine",
        "Niacin", "Tyrosine", "Tryptamine", "Tryptophan", "Phenylalanine"))) %>%
    ungroup() %>%
    drop_na() %>%
    mutate(enterococcus_expansion = ifelse(enterococcus_rel_abundance >=
        optimal_cutpoint_rel$optimal_cutpoint[2], 1, 0)) %>%
    arrange(enterococcus_expansion, sampleID) %>%
    group_by(compound) %>%
    mutate(enterococcus_expansion_0 = length(mvalue__mM[enterococcus_expansion ==
        "0"]), enterococcus_expansion_1 = length(mvalue__mM[enterococcus_expansion ==
        "1"])) %>%
    filter(any(mvalue__mM != 0)) %>%
    mutate(enterococcus_expansion = ifelse(enterococcus_expansion ==
        "1", "Expansion", "No Expansion"))


metab_boxplot_summary_stats_ecoc_expan <- metab_boxplot_ecoc_expan %>%
    group_by(compound) %>%
    summarise(y.position = max(mvalue__mM) * 1.1)

metab_boxplot_stats_ecoc_expan <- metab_boxplot_ecoc_expan %>%
    group_by(class, compound) %>%
    rstatix::wilcox_test(mvalue__mM ~ enterococcus_expansion) %>%
    rstatix::adjust_pvalue(method = "BH") %>%
    rstatix::add_significance("p.adj") %>%
    mutate(p.adj = ifelse(p.adj < 0.001, "p.adj < 0.001", paste("p.adj = ",
        round(p.adj, 3)))) %>%
    add_xy_position(x = "enterococcus_expansion") %>%
    select(-y.position) %>%
    left_join(metab_boxplot_summary_stats_ecoc_expan)


gg_metab_boxplot_ecoc_expan <- ggboxplot(metab_boxplot_ecoc_expan,
    x = "enterococcus_expansion", y = "mvalue__mM", fill = "enterococcus_expansion",
    alpha = 0.65, outlier.shape = NA) + theme(legend.text = et(size = 14,
    color = "black"), legend.title = et(size = 14, color = "black"),
    axis.title.x = eb(), axis.title.y = et(size = 16, color = "black"),
    strip.text = et(size = 14, color = "black"), strip.background = eb(),
    panel.border = eb(), axis.line = eb()) + facet_wrap(~compound,
    scales = "free_y", nrow = 2) + annotate("segment", x = 0.35,
    xend = 0.35, y = 0, yend = Inf, colour = "black", linewidth = 1) +
    annotate("segment", x = 0.35, xend = Inf, y = 0, yend = 0,
        colour = "black") + stat_pvalue_manual(metab_boxplot_stats_ecoc_expan,
    label = "{p.adj}", bracket.size = 0) + geom_point(data = metab_boxplot_ecoc_expan,
    aes(x = enterococcus_expansion, y = mvalue__mM, fill = enterococcus_expansion),
    position = position_jitter(width = 0.2), size = 2, shape = 21,
    alpha = 0.65, color = "black") + scale_fill_manual("Enterococcus",
    values = c("gray87", "#389458")) + scale_y_continuous(expand = expansion(mult = c(0.1,
    0.2))) + ylab("Concentration (mM)\n")

gg_metab_boxplot_ecoc_expan

cairo_pdf(filename = "./Results/Figure_5B.pdf", height = 8, width = 16,
    onefile = TRUE)

gg_metab_boxplot_ecoc_expan

invisible(dev.off())

Figure 6A: sPLS-DA Diversity Groups and Healthy Donors using Qualitative Metabolomics

diversity_metab_mat <-
  metab_qual_anon %>% filter(sampleID %in% first_samps$sampleID) %>% 
  mutate(compound = ifelse(compound == "isovaleric-acid", "isovalerate", compound),
         compound = str_to_title(compound)) %>%
  filter(compound %in% heatmap_cmpds$compound|is.na(compound)) %>%
  ungroup() %>%
  left_join(metaphlan_df_sumry %>% select(sampleID, diversity_group_abv)) %>% 
  group_by(sampleID, compound, diversity_group_abv) %>% 
  summarise(mvalue = mean(mvalue, na.rm = TRUE)) %>% 
  ungroup() %>%
  mutate_all(~replace(., is.nan(.), NA)) %>% 
  select(sampleID, compound, mvalue, diversity_group_abv) %>% 
  drop_na(sampleID) %>% 
  pivot_wider(names_from = "compound", values_from = "mvalue") %>% 
  filter(sampleID != "") %>%
  column_to_rownames(var = "sampleID") %>% 
  select(-`NA`, - diversity_group_abv) %>% 
  filter_all(any_vars(!is.na(.)))

diversity_metab_labs <-
  diversity_metab_mat %>% 
  rownames_to_column(var = "sampleID") %>% 
  left_join(metaphlan_df_sumry %>% select(sampleID, diversity_group_abv)) %>% 
  droplevels() %>% 
  pull(diversity_group_abv)

dim(diversity_metab_mat) #123 93 (means 93 compounds and 102 LT patients/infections)
## [1] 102  93
length(diversity_metab_labs) #123 (means 102 LT patients/infections)
## [1] 102
# Begin model training
set.seed(1234)
diversity_train <- sample(1:nrow(diversity_metab_mat), as.integer(0.7*nrow(diversity_metab_mat))) # randomly select 70% samples in training
diversity_test <- setdiff(1:nrow(diversity_metab_mat), diversity_train) # rest is part of the test set

# store matrices into training and test set:
diversity_metab_mat.train <- diversity_metab_mat[diversity_train, ]
diversity_metab_mat.test <- diversity_metab_mat[diversity_test,]
diversity_metab_labs.train <- diversity_metab_labs[diversity_train]
diversity_metab_labs.test <- diversity_metab_labs[diversity_test]

# Train the model to tune hyperparameters
# Initial model to find optimal number of components to include
set.seed(1234)
diversity_train_splsda <- mixOmics::splsda(diversity_metab_mat.train, diversity_metab_labs.train, ncomp = 5)

# Performance assessment
## 5-fold, 100-repeat cross validation
set.seed(1234)
diversity_train_plsda_perf <-
  perf(
    diversity_train_splsda,
    validation = "Mfold",
    folds = 5,
    progressBar = FALSE,
    auc = TRUE,
    nrepeat = 50
  )

plot(
  diversity_train_plsda_perf,
  col = color.mixo(5:7),
  sd = FALSE,
  auc = TRUE,
  legend.position = "horizontal"
) # ncomp = 4 might be best for classification error rate and max.dist

# Number of optimal variables to select for each component
diversity_train_keepX <- c(1:10,  seq(20, 130, 10))

set.seed(123)
diversity_train_tune_splsda <-
  mixOmics::tune.splsda(
    diversity_metab_mat.train,
    diversity_metab_labs.train,
    ncomp = 4, # Choose 4 components (max) to be safe
    validation = 'Mfold',
    folds = 5,
    dist = 'max.dist',
    progressBar = FALSE,
    auc = TRUE,
    measure = "BER",
    test.keepX = diversity_train_keepX,
    nrepeat = 50
  )

plot(diversity_train_tune_splsda, col = color.jet(4))

diversity_train_error <- diversity_train_tune_splsda$error.rate
diversity_train_ncomp <- diversity_train_tune_splsda$choice.ncomp$ncomp # optimal number of components based on t-tests on the error rate
diversity_train_ncomp #2 components are optimal
## [1] 2
diversity_train_select_keepX <- diversity_train_tune_splsda$choice.keepX[1:diversity_train_ncomp]  # optimal number of variables to select per component
diversity_train_select_keepX
## comp1 comp2 
##    60    10
# Final Model
diversity_train_splsda_final <-
  mixOmics::splsda(diversity_metab_mat.train, diversity_metab_labs.train, ncomp = diversity_train_ncomp, keepX = diversity_train_select_keepX)

# Test the model
diversity_predict_train_splsda_final <- predict(diversity_train_splsda_final, diversity_metab_mat.test, 
                                dist = "max.dist")

diversity_predict_train_comp2 <- diversity_predict_train_splsda_final$class$max.dist[,diversity_train_ncomp]

diversity_union <- union(diversity_predict_train_comp2, diversity_metab_labs.test)

confusionMatrix(table(factor(diversity_predict_train_comp2, diversity_union,
                             levels = c("Low", "Medium", "High")),
                      factor(diversity_metab_labs.test, diversity_union,
                             levels = c("Low", "Medium", "High"))))
## Confusion Matrix and Statistics
## 
##         
##          Low High Medium
##   Low      8    7      5
##   High     0    3      2
##   Medium   0    1      5
## 
## Overall Statistics
##                                           
##                Accuracy : 0.5161          
##                  95% CI : (0.3306, 0.6985)
##     No Information Rate : 0.3871          
##     P-Value [Acc > NIR] : 0.099557        
##                                           
##                   Kappa : 0.3101          
##                                           
##  Mcnemar's Test P-Value : 0.006324        
## 
## Statistics by Class:
## 
##                      Class: Low Class: High Class: Medium
## Sensitivity              1.0000     0.27273        0.4167
## Specificity              0.4783     0.90000        0.9474
## Pos Pred Value           0.4000     0.60000        0.8333
## Neg Pred Value           1.0000     0.69231        0.7200
## Prevalence               0.2581     0.35484        0.3871
## Detection Rate           0.2581     0.09677        0.1613
## Detection Prevalence     0.6452     0.16129        0.1935
## Balanced Accuracy        0.7391     0.58636        0.6820
diversity_train_background <- background.predict(diversity_train_splsda_final,
                                            comp.predicted = 2,
                                            xlim = c(-20,20),
                                            ylim = c(-20,20),
                                            dist = "centroids.dist") 

# Model metrics for all samples
diversity_tot <- predict(diversity_train_splsda_final,
                        diversity_metab_mat,
                        dist = "max.dist")

diversity_tot_predict <- diversity_tot$class$max.dist[,diversity_train_ncomp]

diversity_tot_union <- union(diversity_tot_predict, diversity_metab_labs)

diversity_cm <- confusionMatrix(table(factor(diversity_tot_predict, diversity_tot_union,
                                        levels = c("Low", "Medium", "High")),
                      factor(diversity_metab_labs, diversity_tot_union,
                             levels = c("Low", "Medium", "High"))),
                        )
diversity_cm
## Confusion Matrix and Statistics
## 
##         
##          Low High Medium
##   Low     34   16      7
##   High     2   21      3
##   Medium   1    2     16
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6961          
##                  95% CI : (0.5971, 0.7833)
##     No Information Rate : 0.3824          
##     P-Value [Acc > NIR] : 1.375e-10       
##                                           
##                   Kappa : 0.5341          
##                                           
##  Mcnemar's Test P-Value : 0.001377        
## 
## Statistics by Class:
## 
##                      Class: Low Class: High Class: Medium
## Sensitivity              0.9189      0.5385        0.6154
## Specificity              0.6462      0.9206        0.9605
## Pos Pred Value           0.5965      0.8077        0.8421
## Neg Pred Value           0.9333      0.7632        0.8795
## Prevalence               0.3627      0.3824        0.2549
## Detection Rate           0.3333      0.2059        0.1569
## Detection Prevalence     0.5588      0.2549        0.1863
## Balanced Accuracy        0.7825      0.7295        0.7880
# Additional model measures
diversity_epi <- mltest::ml_test(predicted = factor(diversity_tot_predict, levels = c("Low", "Medium", "High")),
                                 true = factor(diversity_metab_labs, levels = c("Low", "Medium", "High")))

diversity_cm_names <- diversity_cm$table
colnames(diversity_cm_names) <- c("Actual\nLow", "Actual\nMedium", "Actual\nHigh")
rownames(diversity_cm_names) <- c("Predicted\nLow", "Predicted\nMedium", "Predicted\nHigh")

diversity_confusion_df <- diversity_cm_names %>% 
  t() 

{
pdf(file = "./Results/Figure_6A.pdf", height = 10, width = 10)
plotIndiv(
  diversity_train_splsda_final,
  xlim = c(min(diversity_train_splsda_final$variates$X[,1])*1.05, max(diversity_train_splsda_final$variates$X[,1])*1.8),
  ylim = c(min(diversity_train_splsda_final$variates$X[,2])*1.15, max(diversity_train_splsda_final$variates$X[,2])*1.8),
  comp = c(1,2),
  pch = 1,
  ind.names = FALSE,
  legend = FALSE,
  background = diversity_train_background,
  col = c("#3A001E", "#8A0246", "#C20463"),
  star = TRUE,
  point.lwd = 0.5,
  title = NULL,
  size.title = 0.00001,
  style = "graphics",
  legend.title = "Diversity Group",
  X.label = paste0("Component 1 (", round(diversity_train_splsda_final$prop_expl_var$X[1] * 100), "%)"),
  Y.label = paste0("Component 2 (", round(diversity_train_splsda_final$prop_expl_var$X[2] * 100), "%)")
) 
addtable2plot(
  y = -6.68,
  x = min(diversity_train_splsda_final$variates$X[,1])*1.1,
  diversity_confusion_df,
  bty = "o",
  display.rownames = TRUE,
  hlines = TRUE,
  vlines = TRUE,
  cex = 0.75,
  bg = "white"
)
text(
  y = -4.75,
  x = min(diversity_train_splsda_final$variates$X[,1])*1.1,
  paste0("Overall ACC = ", 
                    paste0(formatC(round(diversity_cm$overall[1], 3)*100, digits = 1, format = "f"), "%"), " [",
                    paste0(formatC(round(diversity_cm$overall[3], 3)*100, digits = 1, format = "f"), "%"),", ",
                    paste0(formatC(round(diversity_cm$overall[4], 3)*100, digits = 1, format = "f"), "%"),"]"),
    cex = 0.75, adj = 0)

text(
  y = 3,
  x = 1.5,
  paste0("Low Diversity ACC = ", round(diversity_epi$balanced.accuracy[1]*100, 1), "%"),
    cex = 0.75, adj = 0, col = "#3A001E")
text(
  y = 2.7,
  x = 1.5,
  paste0("Low Diversity Sens. = ", round(diversity_epi$precision[1]*100, 1), "%"),
    cex = 0.75, adj = 0, col = "#3A001E")
text(
  y = 2.4,
  x = 1.5,
  paste0("Low Diversity Spec. = ", round(diversity_epi$specificity[1]*100, 1), "%"),
    cex = 0.75, adj = 0, col = "#3A001E")
# text(
#   y = 2.1,
#   x = 1.5,
#   paste0("Low Diversity OR = ", round(diversity_epi$DOR[1], 1)),
#     cex = 0.75, adj = 0, col = "#3A001E")

text(
  y = -5,
  x = 1.5,
  paste0("Medium Diversity ACC = ", round(diversity_epi$balanced.accuracy[2]*100, 1), "%"),
    cex = 0.75, adj = 0, col = "#8A0246")
text(
  y = -5.3,
  x = 1.5,
  paste0("Medium Diversity Sens. = ", round(diversity_epi$precision[2]*100, 1), "%"),
    cex = 0.75, adj = 0, col = "#8A0246")
text(
  y = -5.6,
  x = 1.5,
  paste0("Medium Diversity Spec. = ", round(diversity_epi$specificity[2]*100, 1), "%"),
    cex = 0.75, adj = 0, col = "#8A0246")
# text(
#   y = -5.9,
#   x = 1.5,
#   paste0("Medium Diversity OR = ", round(diversity_epi$DOR[2], 1)),
#     cex = 0.75, adj = 0, col = "#8A0246")

text(
  y = 3,
  x = min(diversity_train_splsda_final$variates$X[,1])*1.05,
  paste0("High Diversity ACC = ", round(diversity_epi$balanced.accuracy[3]*100, 1), "%"),
    cex = 0.75, adj = 0, col = "#C20463")
text(
  y = 2.7,
  x = min(diversity_train_splsda_final$variates$X[,1])*1.05,
  paste0("High Diversity Sens. = ", round(diversity_epi$precision[3]*100, 1), "%"),
    cex = 0.75, adj = 0, col = "#C20463")
text(
  y = 2.4,
  x = min(diversity_train_splsda_final$variates$X[,1])*1.05,
  paste0("High Diversity Spec. = ", round(diversity_epi$specificity[3]*100, 1), "%"),
    cex = 0.75, adj = 0, col = "#C20463")
# text(
#   y = 2.1,
#   x = min(diversity_train_splsda_final$variates$X[,1])*1.05,
#   paste0("High Diversity OR = ", round(diversity_epi$DOR[3], 1)),
#     cex = 0.75, adj = 0, col = "#C20463")

invisible(dev.off())
}

plotIndiv(
  diversity_train_splsda_final,
  xlim = c(min(diversity_train_splsda_final$variates$X[,1])*1.05, max(diversity_train_splsda_final$variates$X[,1])*1.8),
  ylim = c(min(diversity_train_splsda_final$variates$X[,2])*1.15, max(diversity_train_splsda_final$variates$X[,2])*1.8),
  comp = c(1,2),
  pch = 1,
  ind.names = FALSE,
  legend = FALSE,
  background = diversity_train_background,
  col = c("#3A001E", "#8A0246", "#C20463"),
  star = TRUE,
  point.lwd = 0.5,
  title = NULL,
  size.title = 0.00001,
  style = "graphics",
  legend.title = "Diversity Group",
  X.label = paste0("Component 1 (", round(diversity_train_splsda_final$prop_expl_var$X[1] * 100), "%)"),
  Y.label = paste0("Component 2 (", round(diversity_train_splsda_final$prop_expl_var$X[2] * 100), "%)")
) 
addtable2plot(
  y = -6.68,
  x = min(diversity_train_splsda_final$variates$X[,1])*1.1,
  diversity_confusion_df,
  bty = "o",
  display.rownames = TRUE,
  hlines = TRUE,
  vlines = TRUE,
  cex = 0.75,
  bg = "white"
)
text(
  y = -4.75,
  x = min(diversity_train_splsda_final$variates$X[,1])*1.1,
  paste0("Overall ACC = ", 
                    paste0(formatC(round(diversity_cm$overall[1], 3)*100, digits = 1, format = "f"), "%"), " [",
                    paste0(formatC(round(diversity_cm$overall[3], 3)*100, digits = 1, format = "f"), "%"),", ",
                    paste0(formatC(round(diversity_cm$overall[4], 3)*100, digits = 1, format = "f"), "%"),"]"),
    cex = 0.75, adj = 0)

text(
  y = 3,
  x = 1.5,
  paste0("Low Diversity ACC = ", round(diversity_epi$balanced.accuracy[1]*100, 1), "%"),
    cex = 0.75, adj = 0, col = "#3A001E")
text(
  y = 2.7,
  x = 1.5,
  paste0("Low Diversity Sens. = ", round(diversity_epi$precision[1]*100, 1), "%"),
    cex = 0.75, adj = 0, col = "#3A001E")
text(
  y = 2.4,
  x = 1.5,
  paste0("Low Diversity Spec. = ", round(diversity_epi$specificity[1]*100, 1), "%"),
    cex = 0.75, adj = 0, col = "#3A001E")
# text(
#   y = 2.1,
#   x = 1.5,
#   paste0("Low Diversity OR = ", round(diversity_epi$DOR[1], 1)),
#     cex = 0.75, adj = 0, col = "#3A001E")

text(
  y = -5,
  x = 1.5,
  paste0("Medium Diversity ACC = ", round(diversity_epi$balanced.accuracy[2]*100, 1), "%"),
    cex = 0.75, adj = 0, col = "#8A0246")
text(
  y = -5.3,
  x = 1.5,
  paste0("Medium Diversity Sens. = ", round(diversity_epi$precision[2]*100, 1), "%"),
    cex = 0.75, adj = 0, col = "#8A0246")
text(
  y = -5.6,
  x = 1.5,
  paste0("Medium Diversity Spec. = ", round(diversity_epi$specificity[2]*100, 1), "%"),
    cex = 0.75, adj = 0, col = "#8A0246")
# text(
#   y = -5.9,
#   x = 1.5,
#   paste0("Medium Diversity OR = ", round(diversity_epi$DOR[2], 1)),
#     cex = 0.75, adj = 0, col = "#8A0246")

text(
  y = 3,
  x = min(diversity_train_splsda_final$variates$X[,1])*1.05,
  paste0("High Diversity ACC = ", round(diversity_epi$balanced.accuracy[3]*100, 1), "%"),
    cex = 0.75, adj = 0, col = "#C20463")
text(
  y = 2.7,
  x = min(diversity_train_splsda_final$variates$X[,1])*1.05,
  paste0("High Diversity Sens. = ", round(diversity_epi$precision[3]*100, 1), "%"),
    cex = 0.75, adj = 0, col = "#C20463")
text(
  y = 2.4,
  x = min(diversity_train_splsda_final$variates$X[,1])*1.05,
  paste0("High Diversity Spec. = ", round(diversity_epi$specificity[3]*100, 1), "%"),
    cex = 0.75, adj = 0, col = "#C20463")

# text(
#   y = 2.1,
#   x = min(diversity_train_splsda_final$variates$X[,1])*1.05,
#   paste0("High Diversity OR = ", round(diversity_epi$DOR[3], 1)),
#     cex = 0.75, adj = 0, col = "#C20463")

Figure 6B: sPLS-DA Any Bacterial Infection using Qualitative Metabolomics

infx_metab_mat <-
  metab_qual_anon %>%
  mutate(compound = ifelse(compound == "isovaleric-acid", "isovalerate", compound),
       compound = str_to_title(compound)) %>%
  filter(compound %in% heatmap_cmpds$compound|is.na(compound)) %>%
  ungroup() %>%
  right_join(peri_matrix_all %>% select(sampleID, bact_infection_present)) %>% 
  group_by(sampleID, compound, bact_infection_present) %>% 
  summarise(mvalue = mean(mvalue, na.rm = TRUE)) %>% 
  ungroup() %>%
  mutate_all(~replace(., is.nan(.), NA)) %>% 
  select(sampleID, compound, mvalue, bact_infection_present) %>% 
  drop_na(sampleID) %>% 
  pivot_wider(names_from = "compound", values_from = "mvalue") %>% 
  filter(sampleID != "") %>%
  mutate(bact_infection_present = ifelse(grepl(x = bact_infection_present, pattern = "No"), "No Infection", "Infection")) %>% 
  select(-bact_infection_present) %>% 
  column_to_rownames(var = "sampleID") %>% 
  select(-`NA`) %>% 
  filter_all(any_vars(!is.na(.)))

  
infx_metab_labs <-
  infx_metab_mat %>% 
  rownames_to_column(var = "sampleID") %>% 
  left_join(peri_matrix_all %>% select(sampleID, bact_infection_present)) %>% 
  mutate(bact_infection_present = ifelse(grepl(x = bact_infection_present, pattern = "No"), "No Infection", "Infection")) %>% 
  pull(bact_infection_present)

dim(infx_metab_mat) #108 93 (means 93 compounds and 108 LT patients/infections)
## [1] 108  93
length(infx_metab_labs) #108 (means 108 LT patients/infections)
## [1] 108
# Begin model training
set.seed(1234)
infx_train <- sample(1:nrow(infx_metab_mat), as.integer(0.7*nrow(infx_metab_mat))) # randomly select 70% samples in training
infx_test <- setdiff(1:nrow(infx_metab_mat), infx_train) # rest is part of the test set

# store matrices into training and test set:
infx_metab_mat.train <- infx_metab_mat[infx_train, ]
infx_metab_mat.test <- infx_metab_mat[infx_test,]
infx_metab_labs.train <- infx_metab_labs[infx_train]
infx_metab_labs.test <- infx_metab_labs[infx_test]

# Train the model to tune hyperparameters
# Initial model to find optimal number of components to include
set.seed(1234)
infx_train_splsda <- mixOmics::splsda(infx_metab_mat.train, infx_metab_labs.train, ncomp = 5)

# Performance assessment
## 5-fold, 100-repeat cross validation
set.seed(1234)
infx_train_plsda_perf <-
  perf(
    infx_train_splsda,
    validation = "Mfold",
    folds = 5,
    progressBar = FALSE,
    auc = TRUE,
    nrepeat = 50
  )

plot(
  infx_train_plsda_perf,
  col = color.mixo(5:7),
  sd = FALSE,
  auc = TRUE,
  legend.position = "horizontal"
) # ncomp = 1 is best for classification error rate and max.dist

# Number of optimal variables to select for each component
infx_train_keepX <- c(1:10,  seq(20, 108, 10))

set.seed(123)
infx_train_tune_splsda <-
  mixOmics::tune.splsda(
    infx_metab_mat.train,
    infx_metab_labs.train,
    ncomp = 3, # Choose 3 components (max) to be safe
    validation = 'Mfold',
    folds = 5,
    dist = 'max.dist',
    progressBar = FALSE,
    auc = TRUE,
    measure = "BER",
    test.keepX = infx_train_keepX,
    nrepeat = 50
  )

plot(infx_train_tune_splsda, col = color.jet(3))

infx_train_error <- infx_train_tune_splsda$error.rate
infx_train_ncomp <- infx_train_tune_splsda$choice.ncomp$ncomp # optimal number of components based on t-tests on the error rate
infx_train_ncomp #1 component is optimal
## [1] 1
infx_train_select_keepX <- infx_train_tune_splsda$choice.keepX[1:(infx_train_ncomp + 1)]  # optimal number of variables to select per component
infx_train_select_keepX
## comp1 comp2 
##    90    80
# Final Model
infx_train_splsda_final <-
  mixOmics::splsda(infx_metab_mat.train, infx_metab_labs.train, ncomp = (infx_train_ncomp + 1), keepX = infx_train_select_keepX)

# Test the model
infx_predict_train_splsda_final <- predict(infx_train_splsda_final, infx_metab_mat.test, 
                                dist = "max.dist")

infx_predict_train_comp2 <- infx_predict_train_splsda_final$class$max.dist[,(infx_train_ncomp + 1)]

infx_union <- union(infx_predict_train_comp2, infx_metab_labs.test)

confusionMatrix(table(factor(infx_predict_train_comp2, infx_union),
                      factor(infx_metab_labs.test, infx_union)),
                 positive = "Infection")
## Confusion Matrix and Statistics
## 
##               
##                No Infection Infection
##   No Infection           20         5
##   Infection               7         1
##                                          
##                Accuracy : 0.6364         
##                  95% CI : (0.4512, 0.796)
##     No Information Rate : 0.8182         
##     P-Value [Acc > NIR] : 0.9965         
##                                          
##                   Kappa : -0.082         
##                                          
##  Mcnemar's Test P-Value : 0.7728         
##                                          
##             Sensitivity : 0.1667         
##             Specificity : 0.7407         
##          Pos Pred Value : 0.1250         
##          Neg Pred Value : 0.8000         
##              Prevalence : 0.1818         
##          Detection Rate : 0.0303         
##    Detection Prevalence : 0.2424         
##       Balanced Accuracy : 0.4537         
##                                          
##        'Positive' Class : Infection      
## 
infx_train_background <- background.predict(infx_train_splsda_final,
                                            comp.predicted = 2,
                                            xlim = c(-20,20),
                                            ylim = c(-20,20),
                                            dist = "centroids.dist") 

# Model metrics for all samples
infx_tot <- predict(infx_train_splsda_final,
                        infx_metab_mat,
                        dist = "max.dist")

infx_tot_predict <- infx_tot$class$max.dist[,(infx_train_ncomp + 1)]

infx_tot_union <- union(infx_tot_predict, infx_metab_labs)

infx_cm <- confusionMatrix(table(factor(infx_tot_predict, infx_tot_union,
                                        levels = c("Infection", "No Infection")),
                      factor(infx_metab_labs, infx_tot_union,
                             levels = c("Infection", "No Infection"))),
                      positive = "Infection")
infx_cm
## Confusion Matrix and Statistics
## 
##               
##                No Infection Infection
##   No Infection           17         9
##   Infection              10        72
##                                          
##                Accuracy : 0.8241         
##                  95% CI : (0.739, 0.8906)
##     No Information Rate : 0.75           
##     P-Value [Acc > NIR] : 0.04398        
##                                          
##                   Kappa : 0.525          
##                                          
##  Mcnemar's Test P-Value : 1.00000        
##                                          
##             Sensitivity : 0.8889         
##             Specificity : 0.6296         
##          Pos Pred Value : 0.8780         
##          Neg Pred Value : 0.6538         
##              Prevalence : 0.7500         
##          Detection Rate : 0.6667         
##    Detection Prevalence : 0.7593         
##       Balanced Accuracy : 0.7593         
##                                          
##        'Positive' Class : Infection      
## 
# Additional model measures
infx_epi <- epiR::epi.tests(table(infx_tot_predict, infx_metab_labs), conf.level = 0.95)

infx_confusion_df <- infx_epi$tab %>% 
  t() %>% 
  as.data.frame() %>% 
  rownames_to_column(var = "actual") %>% 
  mutate(actual = case_when(grepl(actual, pattern = "+", fixed = TRUE) ~ "Actual\nInfection",
                            grepl(actual, pattern = "-", fixed = TRUE) ~ "Actual\nNo Infection",
                            TRUE ~ "Total")) %>% 
  dplyr::rename("Predicted\nInfection" = "Test +",
                "Predicted\nNo Infection" = "Test -") %>% 
  column_to_rownames(var = "actual")

{
pdf(file = "./Results/Figure_6B.pdf", height = 10, width = 10)
plotIndiv(
  infx_train_splsda_final,
  comp = c(1,2),
  pch = 1,
  ind.names = FALSE,
  legend = FALSE,
  background = infx_train_background,
  col = c("goldenrod", "gray75"),
  star = TRUE,
  point.lwd = 0.5,
  title = NULL,
  size.title = 0.00001,
  style = "graphics",
  legend.title = "Infection Group",
  X.label = paste0("Component 1 (", round(infx_train_splsda_final$prop_expl_var$X[1] * 100), "%)"),
  Y.label = paste0("Component 2 (", round(infx_train_splsda_final$prop_expl_var$X[2] * 100), "%)")
) 
addtable2plot(
  y = max(infx_train_splsda_final$variates$X[,2])*0.1,
  x = min(infx_train_splsda_final$variates$X[,1]),
  infx_confusion_df,
  bty = "o",
  display.rownames = TRUE,
  hlines = TRUE,
  vlines = TRUE,
  cex = 0.75,
  bg = "white"
)
text(
  y = -4,
  x = -2.75,
  substitute(bold("Infection")), cex = 1.75, adj = 0, col = "goldenrod")
text(
  y = 4,
  x = -1,
  substitute(bold("No Infection")), cex = 1.75, adj = 0, col = "gray70")
text(
  y = max(infx_train_splsda_final$variates$X[,2])*1,
  x = min(infx_train_splsda_final$variates$X[,1]),
  paste0("ACC = ", 
                    round(infx_cm$overall[1]*100, 1), "% [",
                    round(infx_cm$overall[3]*100, 1), "%, ",
                    round(infx_cm$overall[4]*100, 1), "%]"),
    cex = 0.75, adj = 0)
text(
  y = max(infx_train_splsda_final$variates$X[,2])*0.9,
  x = min(infx_train_splsda_final$variates$X[,1]),
  paste0("Sens. = ", 
                    paste0(formatC(round(infx_epi$detail[2][3,], 3)*100, digits = 1, format = "f"), "%"), " [",
                    paste0(formatC(round(infx_epi$detail[3][3,], 3)*100, digits = 1, format = "f"), "%"),", ",
                    paste0(formatC(round(infx_epi$detail[4][3,], 3)*100, digits = 1, format = "f"), "%"),"]"),
    cex = 0.75, adj = 0)
text(
  y = max(infx_train_splsda_final$variates$X[,2])*0.8,
  x = min(infx_train_splsda_final$variates$X[,1]),
  paste0("Spec. = ", 
                    paste0(formatC(round(infx_epi$detail[2][4,], 3)*100, digits = 1, format = "f"), "%"), " [",
                    paste0(formatC(round(infx_epi$detail[3][4,], 3)*100, digits = 1, format = "f"), "%"),", ",
                    paste0(formatC(round(infx_epi$detail[4][4,], 3)*100, digits = 1, format = "f"), "%"),"]"),
    cex = 0.75, adj = 0)
text(
  y = max(infx_train_splsda_final$variates$X[,2])*0.7,
  x = min(infx_train_splsda_final$variates$X[,1]),
  paste0("OR = ",
                    formatC(round(infx_epi$detail[2][6,], 3), digits = 1, format = "f"),  " [",
                    formatC(round(infx_epi$detail[3][6,], 3), digits = 1, format = "f"), ", ",
                    formatC(round(infx_epi$detail[4][6,], 3), digits = 1, format = "f"), "]"),
    cex = 0.75, adj = 0)
invisible(dev.off())
}

 plotIndiv(
  infx_train_splsda_final,
  comp = c(1,2),
  pch = 1,
  ind.names = FALSE,
  legend = FALSE,
  background = infx_train_background,
  col = c("goldenrod", "gray87"),
  star = TRUE,
  point.lwd = 0.5,
  title = NULL,
  size.title = 0.00001,
  style = "graphics",
  legend.title = "Infection Group",
  X.label = paste0("Component 1 (", round(infx_train_splsda_final$prop_expl_var$X[1] * 100), "%)"),
  Y.label = paste0("Component 2 (", round(infx_train_splsda_final$prop_expl_var$X[2] * 100), "%)")
) 
addtable2plot(
  y = max(infx_train_splsda_final$variates$X[,2])*0.1,
  x = min(infx_train_splsda_final$variates$X[,1]),
  infx_confusion_df,
  bty = "o",
  display.rownames = TRUE,
  hlines = TRUE,
  vlines = TRUE,
  cex = 0.75,
  bg = "white"
)
text(
  y = -4,
  x = -2.75,
  substitute(bold("Infection")), cex = 1.75, adj = 0, col = "goldenrod")
text(
  y = 4,
  x = -1,
  substitute(bold("No Infection")), cex = 1.75, adj = 0, col = "gray70")
text(
  y = max(infx_train_splsda_final$variates$X[,2])*1,
  x = min(infx_train_splsda_final$variates$X[,1]),
  paste0("ACC = ", 
                    round(infx_cm$overall[1]*100, 1), "% [",
                    round(infx_cm$overall[3]*100, 1), "%, ",
                    round(infx_cm$overall[4]*100, 1), "%]"),
    cex = 0.75, adj = 0)
text(
  y = max(infx_train_splsda_final$variates$X[,2])*0.9,
  x = min(infx_train_splsda_final$variates$X[,1]),
  paste0("Sens. = ", 
                    paste0(formatC(round(infx_epi$detail[2][3,], 3)*100, digits = 1, format = "f"), "%"), " [",
                    paste0(formatC(round(infx_epi$detail[3][3,], 3)*100, digits = 1, format = "f"), "%"),", ",
                    paste0(formatC(round(infx_epi$detail[4][3,], 3)*100, digits = 1, format = "f"), "%"),"]"),
    cex = 0.75, adj = 0)
text(
  y = max(infx_train_splsda_final$variates$X[,2])*0.8,
  x = min(infx_train_splsda_final$variates$X[,1]),
  paste0("Spec. = ", 
                    paste0(formatC(round(infx_epi$detail[2][4,], 3)*100, digits = 1, format = "f"), "%"), " [",
                    paste0(formatC(round(infx_epi$detail[3][4,], 3)*100, digits = 1, format = "f"), "%"),", ",
                    paste0(formatC(round(infx_epi$detail[4][4,], 3)*100, digits = 1, format = "f"), "%"),"]"),
    cex = 0.75, adj = 0)
text(
  y = max(infx_train_splsda_final$variates$X[,2])*0.7,
  x = min(infx_train_splsda_final$variates$X[,1]),
  paste0("OR = ",
                    formatC(round(infx_epi$detail[2][6,], 3), digits = 1, format = "f"),  " [",
                    formatC(round(infx_epi$detail[3][6,], 3), digits = 1, format = "f"), ", ",
                    formatC(round(infx_epi$detail[4][6,], 3), digits = 1, format = "f"), "]"),
    cex = 0.75, adj = 0)

Combine Figure 6A + 6B: sPLS-DA Diversity Groups and Any Bacterial Infection

{
    pdf(file = "./Results/Figure_6.pdf", height = 10, width = 18)
    par(mfrow = c(1, 2))

    # Figure 6A
    plotIndiv(diversity_train_splsda_final, xlim = c(min(diversity_train_splsda_final$variates$X[,
        1]) * 1.05, max(diversity_train_splsda_final$variates$X[,
        1]) * 1.8), ylim = c(min(diversity_train_splsda_final$variates$X[,
        2]) * 1.15, max(diversity_train_splsda_final$variates$X[,
        2]) * 1.8), comp = c(1, 2), pch = 1, ind.names = FALSE,
        legend = FALSE, background = diversity_train_background,
        col = c("#3A001E", "#8A0246", "#C20463"), star = TRUE,
        point.lwd = 0.5, title = NULL, size.title = 1e-05, style = "graphics",
        legend.title = "Diversity Group", X.label = paste0("Component 1 (",
            round(diversity_train_splsda_final$prop_expl_var$X[1] *
                100), "%)"), Y.label = paste0("Component 2 (",
            round(diversity_train_splsda_final$prop_expl_var$X[2] *
                100), "%)"))
    addtable2plot(y = -4.68, x = min(diversity_train_splsda_final$variates$X[,
        1]) * 1.1, diversity_confusion_df, bty = "o", display.rownames = TRUE,
        hlines = TRUE, vlines = TRUE, cex = 0.75, bg = "white")
    text(y = -6.5, x = min(diversity_train_splsda_final$variates$X[,
        1]) * 1.1, paste0("Overall ACC = ", paste0(formatC(round(diversity_cm$overall[1],
        3) * 100, digits = 1, format = "f"), "%"), " [", paste0(formatC(round(diversity_cm$overall[3],
        3) * 100, digits = 1, format = "f"), "%"), ", ", paste0(formatC(round(diversity_cm$overall[4],
        3) * 100, digits = 1, format = "f"), "%"), "]"), cex = 1.05,
        adj = 0)

    text(y = 3, x = 0.5, paste0("Low Diversity ACC = ", round(diversity_epi$balanced.accuracy[1] *
        100, 1), "%"), cex = 1.05, adj = 0, col = "#3A001E")
    text(y = 2.7, x = 0.5, paste0("Low Diversity Sens. = ", round(diversity_epi$precision[1] *
        100, 1), "%"), cex = 1.05, adj = 0, col = "#3A001E")
    text(y = 2.4, x = 0.5, paste0("Low Diversity Spec. = ", round(diversity_epi$specificity[1] *
        100, 1), "%"), cex = 1.05, adj = 0, col = "#3A001E")
    # text( y = 2.1, x = 0.5, paste0('Low Diversity OR = ',
    # round(diversity_epi$DOR[1], 1)), cex = 1.05, adj = 0,
    # col = '#3A001E')

    text(y = -5.5, x = 0, paste0("Medium Diversity ACC = ", round(diversity_epi$balanced.accuracy[2] *
        100, 1), "%"), cex = 1.05, adj = 0, col = "#8A0246")
    text(y = -5.8, x = 0, paste0("Medium Diversity Sens. = ",
        round(diversity_epi$precision[2] * 100, 1), "%"), cex = 1.05,
        adj = 0, col = "#8A0246")
    text(y = -6.1, x = 0, paste0("Medium Diversity Spec. = ",
        round(diversity_epi$specificity[2] * 100, 1), "%"), cex = 1.05,
        adj = 0, col = "#8A0246")
    # text( y = -7.4, x = 0, paste0('Medium Diversity OR =
    # ', round(diversity_epi$DOR[2], 1)), cex = 1.05, adj =
    # 0, col = '#8A0246')

    text(y = 3, x = min(diversity_train_splsda_final$variates$X[,
        1]) * 1.05, paste0("High Diversity ACC = ", round(diversity_epi$balanced.accuracy[3] *
        100, 1), "%"), cex = 1.05, adj = 0, col = "#C20463")
    text(y = 2.7, x = min(diversity_train_splsda_final$variates$X[,
        1]) * 1.05, paste0("High Diversity Sens. = ", round(diversity_epi$precision[3] *
        100, 1), "%"), cex = 1.05, adj = 0, col = "#C20463")
    text(y = 2.4, x = min(diversity_train_splsda_final$variates$X[,
        1]) * 1.05, paste0("High Diversity Spec. = ", round(diversity_epi$specificity[3] *
        100, 1), "%"), cex = 1.05, adj = 0, col = "#C20463")
    # text( y = 2.1, x =
    # min(diversity_train_splsda_final$variates$X[,1])*1.05,
    # paste0('High Diversity OR = ',
    # round(diversity_epi$DOR[3], 1)), cex = 1.05, adj = 0,
    # col = '#C20463')

    # Figure 6B
    plotIndiv(infx_train_splsda_final, comp = c(1, 2), pch = 1,
        ind.names = FALSE, legend = FALSE, background = infx_train_background,
        col = c("goldenrod", "gray75"), star = TRUE, point.lwd = 0.5,
        title = NULL, size.title = 1e-05, style = "graphics",
        legend.title = "Infection Group", X.label = paste0("Component 1 (",
            round(infx_train_splsda_final$prop_expl_var$X[1] *
                100), "%)"), Y.label = paste0("Component 2 (",
            round(infx_train_splsda_final$prop_expl_var$X[2] *
                100), "%)"))
    addtable2plot(y = max(infx_train_splsda_final$variates$X[,
        2]) * 0.1, x = min(infx_train_splsda_final$variates$X[,
        1]), infx_confusion_df, bty = "o", display.rownames = TRUE,
        hlines = TRUE, vlines = TRUE, cex = 0.75, bg = "white")
    text(y = -4, x = -2.75, substitute(bold("Infection")), cex = 1.75,
        adj = 0, col = "goldenrod")
    text(y = 4, x = -1, substitute(bold("No Infection")), cex = 1.75,
        adj = 0, col = "gray70")
    text(y = max(infx_train_splsda_final$variates$X[, 2]) * 1,
        x = min(infx_train_splsda_final$variates$X[, 1]), paste0("ACC = ",
            round(infx_cm$overall[1] * 100, 1), "% [", round(infx_cm$overall[3] *
                100, 1), "%, ", round(infx_cm$overall[4] * 100,
                1), "%]"), cex = 1.05, adj = 0)
    text(y = max(infx_train_splsda_final$variates$X[, 2]) * 0.9,
        x = min(infx_train_splsda_final$variates$X[, 1]), paste0("Sens. = ",
            paste0(formatC(round(infx_epi$detail[2][3, ], 3) *
                100, digits = 1, format = "f"), "%"), " [", paste0(formatC(round(infx_epi$detail[3][3,
                ], 3) * 100, digits = 1, format = "f"), "%"),
            ", ", paste0(formatC(round(infx_epi$detail[4][3,
                ], 3) * 100, digits = 1, format = "f"), "%"),
            "]"), cex = 1.05, adj = 0)
    text(y = max(infx_train_splsda_final$variates$X[, 2]) * 0.8,
        x = min(infx_train_splsda_final$variates$X[, 1]), paste0("Spec. = ",
            paste0(formatC(round(infx_epi$detail[2][4, ], 3) *
                100, digits = 1, format = "f"), "%"), " [", paste0(formatC(round(infx_epi$detail[3][4,
                ], 3) * 100, digits = 1, format = "f"), "%"),
            ", ", paste0(formatC(round(infx_epi$detail[4][4,
                ], 3) * 100, digits = 1, format = "f"), "%"),
            "]"), cex = 1.05, adj = 0)
    text(y = max(infx_train_splsda_final$variates$X[, 2]) * 0.7,
        x = min(infx_train_splsda_final$variates$X[, 1]), paste0("OR = ",
            formatC(round(infx_epi$detail[2][6, ], 3), digits = 1,
                format = "f"), " [", formatC(round(infx_epi$detail[3][6,
                ], 3), digits = 1, format = "f"), ", ", formatC(round(infx_epi$detail[4][6,
                ], 3), digits = 1, format = "f"), "]"), cex = 1.05,
        adj = 0)

    invisible(dev.off())
}


# Figure 6A
par(mfrow = c(1, 2))

plotIndiv(diversity_train_splsda_final, xlim = c(min(diversity_train_splsda_final$variates$X[,
    1]) * 1.05, max(diversity_train_splsda_final$variates$X[,
    1]) * 1.8), ylim = c(min(diversity_train_splsda_final$variates$X[,
    2]) * 1.15, max(diversity_train_splsda_final$variates$X[,
    2]) * 1.8), comp = c(1, 2), pch = 1, ind.names = FALSE, legend = FALSE,
    background = diversity_train_background, col = c("#3A001E",
        "#8A0246", "#C20463"), star = TRUE, point.lwd = 0.5,
    title = NULL, size.title = 1e-05, style = "graphics", legend.title = "Diversity Group",
    X.label = paste0("Component 1 (", round(diversity_train_splsda_final$prop_expl_var$X[1] *
        100), "%)"), Y.label = paste0("Component 2 (", round(diversity_train_splsda_final$prop_expl_var$X[2] *
        100), "%)"))
addtable2plot(y = -4.68, x = min(diversity_train_splsda_final$variates$X[,
    1]) * 1.1, diversity_confusion_df, bty = "o", display.rownames = TRUE,
    hlines = TRUE, vlines = TRUE, cex = 0.75, bg = "white")
text(y = -6.5, x = min(diversity_train_splsda_final$variates$X[,
    1]) * 1.1, paste0("Overall ACC = ", paste0(formatC(round(diversity_cm$overall[1],
    3) * 100, digits = 1, format = "f"), "%"), " [", paste0(formatC(round(diversity_cm$overall[3],
    3) * 100, digits = 1, format = "f"), "%"), ", ", paste0(formatC(round(diversity_cm$overall[4],
    3) * 100, digits = 1, format = "f"), "%"), "]"), cex = 1.05,
    adj = 0)

text(y = 3, x = 0.5, paste0("Low Diversity ACC = ", round(diversity_epi$balanced.accuracy[1] *
    100, 1), "%"), cex = 1.05, adj = 0, col = "#3A001E")
text(y = 2.7, x = 0.5, paste0("Low Diversity Sens. = ", round(diversity_epi$precision[1] *
    100, 1), "%"), cex = 1.05, adj = 0, col = "#3A001E")
text(y = 2.4, x = 0.5, paste0("Low Diversity Spec. = ", round(diversity_epi$specificity[1] *
    100, 1), "%"), cex = 1.05, adj = 0, col = "#3A001E")
# text( y = 2.1, x = 0.5, paste0('Low Diversity OR = ',
# round(diversity_epi$DOR[1], 1)), cex = 1.05, adj = 0, col
# = '#3A001E')

text(y = -5.5, x = 0, paste0("Medium Diversity ACC = ", round(diversity_epi$balanced.accuracy[2] *
    100, 1), "%"), cex = 1.05, adj = 0, col = "#8A0246")
text(y = -5.8, x = 0, paste0("Medium Diversity Sens. = ", round(diversity_epi$precision[2] *
    100, 1), "%"), cex = 1.05, adj = 0, col = "#8A0246")
text(y = -6.1, x = 0, paste0("Medium Diversity Spec. = ", round(diversity_epi$specificity[2] *
    100, 1), "%"), cex = 1.05, adj = 0, col = "#8A0246")
# text( y = -7.4, x = 0, paste0('Medium Diversity OR = ',
# round(diversity_epi$DOR[2], 1)), cex = 1.05, adj = 0, col
# = '#8A0246')

text(y = 3, x = min(diversity_train_splsda_final$variates$X[,
    1]) * 1.05, paste0("High Diversity ACC = ", round(diversity_epi$balanced.accuracy[3] *
    100, 1), "%"), cex = 1.05, adj = 0, col = "#C20463")
text(y = 2.7, x = min(diversity_train_splsda_final$variates$X[,
    1]) * 1.05, paste0("High Diversity Sens. = ", round(diversity_epi$precision[3] *
    100, 1), "%"), cex = 1.05, adj = 0, col = "#C20463")
text(y = 2.4, x = min(diversity_train_splsda_final$variates$X[,
    1]) * 1.05, paste0("High Diversity Spec. = ", round(diversity_epi$specificity[3] *
    100, 1), "%"), cex = 1.05, adj = 0, col = "#C20463")
# text( y = 2.1, x =
# min(diversity_train_splsda_final$variates$X[,1])*1.05,
# paste0('High Diversity OR = ',
# round(diversity_epi$DOR[3], 1)), cex = 1.05, adj = 0, col
# = '#C20463')

# Figure 6B
plotIndiv(infx_train_splsda_final, comp = c(1, 2), pch = 1, ind.names = FALSE,
    legend = FALSE, background = infx_train_background, col = c("goldenrod",
        "gray75"), star = TRUE, point.lwd = 0.5, title = NULL,
    size.title = 1e-05, style = "graphics", legend.title = "Infection Group",
    X.label = paste0("Component 1 (", round(infx_train_splsda_final$prop_expl_var$X[1] *
        100), "%)"), Y.label = paste0("Component 2 (", round(infx_train_splsda_final$prop_expl_var$X[2] *
        100), "%)"))
addtable2plot(y = max(infx_train_splsda_final$variates$X[, 2]) *
    0.1, x = min(infx_train_splsda_final$variates$X[, 1]), infx_confusion_df,
    bty = "o", display.rownames = TRUE, hlines = TRUE, vlines = TRUE,
    cex = 0.75, bg = "white")
text(y = -4, x = -2.75, substitute(bold("Infection")), cex = 1.75,
    adj = 0, col = "goldenrod")
text(y = 4, x = -1, substitute(bold("No Infection")), cex = 1.75,
    adj = 0, col = "gray70")
text(y = max(infx_train_splsda_final$variates$X[, 2]) * 1, x = min(infx_train_splsda_final$variates$X[,
    1]), paste0("ACC = ", round(infx_cm$overall[1] * 100, 1),
    "% [", round(infx_cm$overall[3] * 100, 1), "%, ", round(infx_cm$overall[4] *
        100, 1), "%]"), cex = 1.05, adj = 0)
text(y = max(infx_train_splsda_final$variates$X[, 2]) * 0.9,
    x = min(infx_train_splsda_final$variates$X[, 1]), paste0("Sens. = ",
        paste0(formatC(round(infx_epi$detail[2][3, ], 3) * 100,
            digits = 1, format = "f"), "%"), " [", paste0(formatC(round(infx_epi$detail[3][3,
            ], 3) * 100, digits = 1, format = "f"), "%"), ", ",
        paste0(formatC(round(infx_epi$detail[4][3, ], 3) * 100,
            digits = 1, format = "f"), "%"), "]"), cex = 1.05,
    adj = 0)
text(y = max(infx_train_splsda_final$variates$X[, 2]) * 0.8,
    x = min(infx_train_splsda_final$variates$X[, 1]), paste0("Spec. = ",
        paste0(formatC(round(infx_epi$detail[2][4, ], 3) * 100,
            digits = 1, format = "f"), "%"), " [", paste0(formatC(round(infx_epi$detail[3][4,
            ], 3) * 100, digits = 1, format = "f"), "%"), ", ",
        paste0(formatC(round(infx_epi$detail[4][4, ], 3) * 100,
            digits = 1, format = "f"), "%"), "]"), cex = 1.05,
    adj = 0)
text(y = max(infx_train_splsda_final$variates$X[, 2]) * 0.7,
    x = min(infx_train_splsda_final$variates$X[, 1]), paste0("OR = ",
        formatC(round(infx_epi$detail[2][6, ], 3), digits = 1,
            format = "f"), " [", formatC(round(infx_epi$detail[3][6,
            ], 3), digits = 1, format = "f"), ", ", formatC(round(infx_epi$detail[4][6,
            ], 3), digits = 1, format = "f"), "]"), cex = 1.05,
    adj = 0)

Supplemental Figure 2A + 2B: Plot Loadings for Diversity Groups

# Take absolute value to make diversity group loadings plot
# narrower
diversity_train_splsda_final$loadings$X <- abs(diversity_train_splsda_final$loadings$X)

# Combine Supplemental Figures 2A + 2B

pdf(file = "./Results/Supplemental_Figure_2.pdf", height = 8,
    width = 10)

par(mfrow = c(1, 3))

# Supplmental Figure 2A
plotLoadings(diversity_train_splsda_final, contrib = "max", method = "mean",
    comp = 1, legend = FALSE, legend.col = c(c("#3A001E", "#8A0246",
        "#C20463")), size.name = 0.6, size.title = rel(1), ndisplay = 50)

# Supplmental Figure 2B
plotLoadings(diversity_train_splsda_final, contrib = "max", method = "mean",
    comp = 2, legend.col = c("#3A001E", "#8A0246", "#C20463"),
    size.name = 0.6, size.title = rel(1), ndisplay = 50)

invisible(dev.off())


par(mfrow = c(1, 3))

# Supplmental Figure 2A
plotLoadings(diversity_train_splsda_final, contrib = "max", method = "mean",
    comp = 1, legend = FALSE, legend.col = c(c("#3A001E", "#8A0246",
        "#C20463")), size.name = 0.6, size.title = rel(1), ndisplay = 50)

# Supplmental Figure 2B
plotLoadings(diversity_train_splsda_final, contrib = "max", method = "mean",
    comp = 2, legend.col = c("#3A001E", "#8A0246", "#C20463"),
    size.name = 0.6, size.title = rel(1), ndisplay = 50)

Supplemental Figure 3A + 3B: Plot Loadings for Any Bacterial Infection

# Take absolute value to make diversity group loadings plot
# narrower
infx_train_splsda_final$loadings$X <- abs(infx_train_splsda_final$loadings$X)

# Combine Supplemental Figures 2A + 2B

pdf(file = "./Results/Supplemental_Figure_3.pdf", height = 8,
    width = 10)

par(mfrow = c(1, 3))

# Supplmental Figure 3A
plotLoadings(infx_train_splsda_final, contrib = "max", method = "mean",
    comp = 1, legend = FALSE, legend.col = c("goldenrod", "gray75"),
    size.name = 0.6, size.title = rel(1), ndisplay = 50)

# Supplmental Figure 3B
plotLoadings(infx_train_splsda_final, contrib = "max", method = "mean",
    comp = 2, legend.col = c("goldenrod", "gray75"), size.name = 0.6,
    size.title = rel(1), ndisplay = 50)

invisible(dev.off())

par(mfrow = c(1, 3))

# Supplmental Figure 3A
plotLoadings(infx_train_splsda_final, contrib = "max", method = "mean",
    comp = 1, legend = FALSE, legend.col = c("goldenrod", "gray75"),
    size.name = 0.6, size.title = rel(1), ndisplay = 50)

# Supplmental Figure 3B
plotLoadings(infx_train_splsda_final, contrib = "max", method = "mean",
    comp = 2, legend.col = c("goldenrod", "gray75"), size.name = 0.6,
    size.title = rel(1), ndisplay = 50)

Clinical Tables

Supplemental Figure 3: Flow Diagram

flow_chart <- flow_exclusions(incl_counts = c(158, 130, 107,
    25), total_label = "Total Patients Enrolled", incl_labels = c("Patients w/ Transplant",
    "Patients Included", "Patients w/ Bacterial Infection"),
    excl_labels = c("No Transplant", "No Sample In Study Period\nDay -7 to +30",
        "Patients w/o Bacterial Infection"), show_count = TRUE)

flow_chart
flow_chart %>%
    export_svg() %>%
    charToRaw() %>%
    rsvg_pdf("./Results/Supplemental_Figure_4.pdf")

Demographics

## Vector of variables to summarize
demo_vars <- c("race", "sex", "age", "meld_transplant", "Alcoholic Hepatitis",
    "Alcoholic Cirrhosis", "NAFLD/NASH", "Primary Sclerosing Cholangitis",
    "Acute Viral Hepatitis", "Chronic Hepatitis B", "Chronic Hepatitis C",
    "Autoimmune", "Wilson's Disease", "Alpha-1 Antitrypsin",
    "Hemachromatosis", "Drug Induced Liver Injury or Toxin",
    "Budd Chiari", "Cryptogenic", "Malignancy", "Other", "Dialysis",
    "Pressers", "Mechanical Ventilation")
## Vector of categorical variables that need transformation
demo_cats <- c("race", "sex", "Alcoholic Hepatitis", "Alcoholic Cirrhosis",
    "NAFLD/NASH", "Primary Sclerosing Cholangitis", "Acute Viral Hepatitis",
    "Chronic Hepatitis B", "Chronic Hepatitis C", "Autoimmune",
    "Wilson's Disease", "Alpha-1 Antitrypsin", "Hemachromatosis",
    "Drug Induced Liver Injury or Toxin", "Budd Chiari", "Cryptogenic",
    "Malignancy", "Other", "Dialysis", "Pressers", "Mechanical Ventilation")


tab1_1 <- CreateTableOne(vars = demo_vars, testNonNormal = "wilcox.test",
    includeNA = TRUE, factorVars = demo_cats, strata = "any_infection",
    data = demo)

summary(tab1_1)  # Age is potentially skewed, need to state that it is skewed and re-run `CreateTableOne`
## 
##      ### Summary of continuous variables ###
## 
## any_infection: 0
##                  n miss p.miss mean sd median p25 p75 min max  skew kurt
## age             82    0      0   51 17     55  40  63   2  77 -1.00  0.8
## meld_transplant 82    0      0   26 10     28  19  33   6  49 -0.04 -0.7
## ------------------------------------------------------------ 
## any_infection: 1
##                  n miss p.miss mean sd median p25 p75 min max skew   kurt
## age             25    0      0   56 15     59  46  68  22  73 -0.9  0.002
## meld_transplant 25    0      0   27 10     30  19  33  11  42 -0.3 -1.057
## 
## p-values
##                   pNormal pNonNormal
## age             0.2072738  0.1765252
## meld_transplant 0.6624494  0.6085202
## 
## Standardize mean differences
##                    1 vs 2
## age             0.2976093
## meld_transplant 0.1025141
## 
## =======================================================================================
## 
##      ### Summary of categorical variables ### 
## 
## any_infection: 0
##                                 var  n miss p.miss
##                                race 82    0    0.0
##                                                   
##                                                   
##                                                   
##                                                   
##                                                   
##                                                   
##                                                   
##                                 sex 82    0    0.0
##                                                   
##                                                   
##                 Alcoholic Hepatitis 82    0    0.0
##                                                   
##                                                   
##                 Alcoholic Cirrhosis 82    0    0.0
##                                                   
##                                                   
##                          NAFLD/NASH 82    0    0.0
##                                                   
##                                                   
##      Primary Sclerosing Cholangitis 82    0    0.0
##                                                   
##                                                   
##               Acute Viral Hepatitis 82    0    0.0
##                                                   
##                                                   
##                 Chronic Hepatitis B 82    0    0.0
##                                                   
##                                                   
##                 Chronic Hepatitis C 82    0    0.0
##                                                   
##                                                   
##                          Autoimmune 82    0    0.0
##                                                   
##                                                   
##                    Wilson's Disease 82    0    0.0
##                                                   
##                                                   
##                 Alpha-1 Antitrypsin 82    0    0.0
##                                                   
##                     Hemachromatosis 82    0    0.0
##                                                   
##                                                   
##  Drug Induced Liver Injury or Toxin 82    0    0.0
##                                                   
##                                                   
##                         Budd Chiari 82    0    0.0
##                                                   
##                         Cryptogenic 82    0    0.0
##                                                   
##                                                   
##                          Malignancy 82    0    0.0
##                                                   
##                                                   
##                               Other 82    0    0.0
##                                                   
##                                                   
##                            Dialysis 82    0    0.0
##                                                   
##                                                   
##                            Pressers 82    0    0.0
##                                                   
##                                                   
##              Mechanical Ventilation 82    0    0.0
##                                                   
##                                                   
##                             level freq percent cum.percent
##  American Indian or Alaska Native    1     1.2         1.2
##              Asian/Mideast Indian    6     7.3         8.5
##            Black/African-American    9    11.0        19.5
##                More than one Race    8     9.8        29.3
##                  Patient Declined    4     4.9        34.1
##                           Unknown    0     0.0        34.1
##                             White   54    65.9       100.0
##                                                           
##                            Female   38    46.3        46.3
##                              Male   44    53.7       100.0
##                                                           
##                                 0   76    92.7        92.7
##                                 1    6     7.3       100.0
##                                                           
##                                 0   42    51.2        51.2
##                                 1   40    48.8       100.0
##                                                           
##                                 0   72    87.8        87.8
##                                 1   10    12.2       100.0
##                                                           
##                                 0   76    92.7        92.7
##                                 1    6     7.3       100.0
##                                                           
##                                 0   78    95.1        95.1
##                                 1    4     4.9       100.0
##                                                           
##                                 0   82   100.0       100.0
##                                 1    0     0.0       100.0
##                                                           
##                                 0   78    95.1        95.1
##                                 1    4     4.9       100.0
##                                                           
##                                 0   77    93.9        93.9
##                                 1    5     6.1       100.0
##                                                           
##                                 0   80    97.6        97.6
##                                 1    2     2.4       100.0
##                                                           
##                                 0   82   100.0       100.0
##                                                           
##                                 0   82   100.0       100.0
##                                 1    0     0.0       100.0
##                                                           
##                                 0   81    98.8        98.8
##                                 1    1     1.2       100.0
##                                                           
##                                 0   82   100.0       100.0
##                                                           
##                                 0   78    95.1        95.1
##                                 1    4     4.9       100.0
##                                                           
##                                 0   65    79.3        79.3
##                                 1   17    20.7       100.0
##                                                           
##                                 0   74    90.2        90.2
##                                 1    8     9.8       100.0
##                                                           
##                                 0   60    73.2        73.2
##                                 1   22    26.8       100.0
##                                                           
##                                 0   76    92.7        92.7
##                                 1    6     7.3       100.0
##                                                           
##                                 0   77    93.9        93.9
##                                 1    5     6.1       100.0
##                                                           
## ------------------------------------------------------------ 
## any_infection: 1
##                                 var  n miss p.miss
##                                race 25    0    0.0
##                                                   
##                                                   
##                                                   
##                                                   
##                                                   
##                                                   
##                                                   
##                                 sex 25    0    0.0
##                                                   
##                                                   
##                 Alcoholic Hepatitis 25    0    0.0
##                                                   
##                                                   
##                 Alcoholic Cirrhosis 25    0    0.0
##                                                   
##                                                   
##                          NAFLD/NASH 25    0    0.0
##                                                   
##                                                   
##      Primary Sclerosing Cholangitis 25    0    0.0
##                                                   
##                                                   
##               Acute Viral Hepatitis 25    0    0.0
##                                                   
##                                                   
##                 Chronic Hepatitis B 25    0    0.0
##                                                   
##                                                   
##                 Chronic Hepatitis C 25    0    0.0
##                                                   
##                                                   
##                          Autoimmune 25    0    0.0
##                                                   
##                                                   
##                    Wilson's Disease 25    0    0.0
##                                                   
##                                                   
##                 Alpha-1 Antitrypsin 25    0    0.0
##                                                   
##                     Hemachromatosis 25    0    0.0
##                                                   
##                                                   
##  Drug Induced Liver Injury or Toxin 25    0    0.0
##                                                   
##                                                   
##                         Budd Chiari 25    0    0.0
##                                                   
##                         Cryptogenic 25    0    0.0
##                                                   
##                                                   
##                          Malignancy 25    0    0.0
##                                                   
##                                                   
##                               Other 25    0    0.0
##                                                   
##                                                   
##                            Dialysis 25    0    0.0
##                                                   
##                                                   
##                            Pressers 25    0    0.0
##                                                   
##                                                   
##              Mechanical Ventilation 25    0    0.0
##                                                   
##                                                   
##                             level freq percent cum.percent
##  American Indian or Alaska Native    0     0.0         0.0
##              Asian/Mideast Indian    2     8.0         8.0
##            Black/African-American    2     8.0        16.0
##                More than one Race    2     8.0        24.0
##                  Patient Declined    1     4.0        28.0
##                           Unknown    2     8.0        36.0
##                             White   16    64.0       100.0
##                                                           
##                            Female    9    36.0        36.0
##                              Male   16    64.0       100.0
##                                                           
##                                 0   23    92.0        92.0
##                                 1    2     8.0       100.0
##                                                           
##                                 0   17    68.0        68.0
##                                 1    8    32.0       100.0
##                                                           
##                                 0   19    76.0        76.0
##                                 1    6    24.0       100.0
##                                                           
##                                 0   25   100.0       100.0
##                                 1    0     0.0       100.0
##                                                           
##                                 0   25   100.0       100.0
##                                 1    0     0.0       100.0
##                                                           
##                                 0   24    96.0        96.0
##                                 1    1     4.0       100.0
##                                                           
##                                 0   25   100.0       100.0
##                                 1    0     0.0       100.0
##                                                           
##                                 0   25   100.0       100.0
##                                 1    0     0.0       100.0
##                                                           
##                                 0   24    96.0        96.0
##                                 1    1     4.0       100.0
##                                                           
##                                 0   25   100.0       100.0
##                                                           
##                                 0   24    96.0        96.0
##                                 1    1     4.0       100.0
##                                                           
##                                 0   25   100.0       100.0
##                                 1    0     0.0       100.0
##                                                           
##                                 0   25   100.0       100.0
##                                                           
##                                 0   24    96.0        96.0
##                                 1    1     4.0       100.0
##                                                           
##                                 0   19    76.0        76.0
##                                 1    6    24.0       100.0
##                                                           
##                                 0   20    80.0        80.0
##                                 1    5    20.0       100.0
##                                                           
##                                 0   16    64.0        64.0
##                                 1    9    36.0       100.0
##                                                           
##                                 0   20    80.0        80.0
##                                 1    5    20.0       100.0
##                                                           
##                                 0   23    92.0        92.0
##                                 1    2     8.0       100.0
##                                                           
## 
## p-values
##                                      pApprox    pExact
## race                               0.3074905 0.4275466
## sex                                0.4953032 0.4904875
## Alcoholic Hepatitis                1.0000000 1.0000000
## Alcoholic Cirrhosis                0.2123469 0.1713637
## NAFLD/NASH                         0.2590605 0.1979409
## Primary Sclerosing Cholangitis     0.3704754 0.3322151
## Acute Viral Hepatitis              0.6007080 0.5709809
## Chronic Hepatitis B                0.5271112 0.2336449
## Chronic Hepatitis C                0.6007080 0.5709809
## Autoimmune                         0.4694777 0.5886832
## Wilson's Disease                   1.0000000 0.5538202
## Alpha-1 Antitrypsin                       NA        NA
## Hemachromatosis                    0.5271112 0.2336449
## Drug Induced Liver Injury or Toxin 1.0000000 1.0000000
## Budd Chiari                               NA        NA
## Cryptogenic                        1.0000000 1.0000000
## Malignancy                         0.9440592 0.7828238
## Other                              0.3063991 0.1774710
## Dialysis                           0.5266900 0.4513768
## Pressers                           0.1465607 0.1238754
## Mechanical Ventilation             1.0000000 0.6642869
## 
## Standardize mean differences
##                                        1 vs 2
## race                               0.45891730
## sex                                0.21130091
## Alcoholic Hepatitis                0.02568258
## Alcoholic Cirrhosis                0.34709743
## NAFLD/NASH                         0.31029007
## Primary Sclerosing Cholangitis     0.39735971
## Acute Viral Hepatitis              0.32025631
## Chronic Hepatitis B                0.28867513
## Chronic Hepatitis C                0.32025631
## Autoimmune                         0.36037499
## Wilson's Disease                   0.08851811
## Alpha-1 Antitrypsin                0.00000000
## Hemachromatosis                    0.28867513
## Drug Induced Liver Injury or Toxin 0.15713484
## Budd Chiari                        0.00000000
## Cryptogenic                        0.04264158
## Malignancy                         0.07849392
## Other                              0.29088216
## Dialysis                           0.19854168
## Pressers                           0.37578690
## Mechanical Ventilation             0.07437488
tableone_skewed <- c("age", "meld_transplant")

tab1_2 <- print(tab1_1, nonnormal = tableone_skewed, formatOptions = list(big.mark = ","))
##                                             Stratified by any_infection
##                                              0                   
##   n                                             82               
##   race (%)                                                       
##      American Indian or Alaska Native            1 (  1.2)       
##      Asian/Mideast Indian                        6 (  7.3)       
##      Black/African-American                      9 ( 11.0)       
##      More than one Race                          8 (  9.8)       
##      Patient Declined                            4 (  4.9)       
##      Unknown                                     0 (  0.0)       
##      White                                      54 ( 65.9)       
##   sex = Male (%)                                44 ( 53.7)       
##   age (median [IQR])                         55.00 [39.50, 63.00]
##   meld_transplant (median [IQR])             28.00 [19.00, 33.00]
##   Alcoholic Hepatitis = 1 (%)                    6 (  7.3)       
##   Alcoholic Cirrhosis = 1 (%)                   40 ( 48.8)       
##   NAFLD/NASH = 1 (%)                            10 ( 12.2)       
##   Primary Sclerosing Cholangitis = 1 (%)         6 (  7.3)       
##   Acute Viral Hepatitis = 1 (%)                  4 (  4.9)       
##   Chronic Hepatitis B = 1 (%)                    0 (  0.0)       
##   Chronic Hepatitis C = 1 (%)                    4 (  4.9)       
##   Autoimmune = 1 (%)                             5 (  6.1)       
##   Wilson's Disease = 1 (%)                       2 (  2.4)       
##   Alpha-1 Antitrypsin = 0 (%)                   82 (100.0)       
##   Hemachromatosis = 1 (%)                        0 (  0.0)       
##   Drug Induced Liver Injury or Toxin = 1 (%)     1 (  1.2)       
##   Budd Chiari = 0 (%)                           82 (100.0)       
##   Cryptogenic = 1 (%)                            4 (  4.9)       
##   Malignancy = 1 (%)                            17 ( 20.7)       
##   Other = 1 (%)                                  8 (  9.8)       
##   Dialysis = 1 (%)                              22 ( 26.8)       
##   Pressers = 1 (%)                               6 (  7.3)       
##   Mechanical Ventilation = 1 (%)                 5 (  6.1)       
##                                             Stratified by any_infection
##                                              1                    p     
##   n                                             25                      
##   race (%)                                                         0.307
##      American Indian or Alaska Native            0 (  0.0)              
##      Asian/Mideast Indian                        2 (  8.0)              
##      Black/African-American                      2 (  8.0)              
##      More than one Race                          2 (  8.0)              
##      Patient Declined                            1 (  4.0)              
##      Unknown                                     2 (  8.0)              
##      White                                      16 ( 64.0)              
##   sex = Male (%)                                16 ( 64.0)         0.495
##   age (median [IQR])                         59.00 [46.00, 68.00]  0.177
##   meld_transplant (median [IQR])             30.00 [19.00, 33.00]  0.609
##   Alcoholic Hepatitis = 1 (%)                    2 (  8.0)         1.000
##   Alcoholic Cirrhosis = 1 (%)                    8 ( 32.0)         0.212
##   NAFLD/NASH = 1 (%)                             6 ( 24.0)         0.259
##   Primary Sclerosing Cholangitis = 1 (%)         0 (  0.0)         0.370
##   Acute Viral Hepatitis = 1 (%)                  0 (  0.0)         0.601
##   Chronic Hepatitis B = 1 (%)                    1 (  4.0)         0.527
##   Chronic Hepatitis C = 1 (%)                    0 (  0.0)         0.601
##   Autoimmune = 1 (%)                             0 (  0.0)         0.469
##   Wilson's Disease = 1 (%)                       1 (  4.0)         1.000
##   Alpha-1 Antitrypsin = 0 (%)                   25 (100.0)            NA
##   Hemachromatosis = 1 (%)                        1 (  4.0)         0.527
##   Drug Induced Liver Injury or Toxin = 1 (%)     0 (  0.0)         1.000
##   Budd Chiari = 0 (%)                           25 (100.0)            NA
##   Cryptogenic = 1 (%)                            1 (  4.0)         1.000
##   Malignancy = 1 (%)                             6 ( 24.0)         0.944
##   Other = 1 (%)                                  5 ( 20.0)         0.306
##   Dialysis = 1 (%)                               9 ( 36.0)         0.527
##   Pressers = 1 (%)                               5 ( 20.0)         0.147
##   Mechanical Ventilation = 1 (%)                 2 (  8.0)         1.000
##                                             Stratified by any_infection
##                                              test   
##   n                                                 
##   race (%)                                          
##      American Indian or Alaska Native               
##      Asian/Mideast Indian                           
##      Black/African-American                         
##      More than one Race                             
##      Patient Declined                               
##      Unknown                                        
##      White                                          
##   sex = Male (%)                                    
##   age (median [IQR])                         nonnorm
##   meld_transplant (median [IQR])             nonnorm
##   Alcoholic Hepatitis = 1 (%)                       
##   Alcoholic Cirrhosis = 1 (%)                       
##   NAFLD/NASH = 1 (%)                                
##   Primary Sclerosing Cholangitis = 1 (%)            
##   Acute Viral Hepatitis = 1 (%)                     
##   Chronic Hepatitis B = 1 (%)                       
##   Chronic Hepatitis C = 1 (%)                       
##   Autoimmune = 1 (%)                                
##   Wilson's Disease = 1 (%)                          
##   Alpha-1 Antitrypsin = 0 (%)                       
##   Hemachromatosis = 1 (%)                           
##   Drug Induced Liver Injury or Toxin = 1 (%)        
##   Budd Chiari = 0 (%)                               
##   Cryptogenic = 1 (%)                               
##   Malignancy = 1 (%)                                
##   Other = 1 (%)                                     
##   Dialysis = 1 (%)                                  
##   Pressers = 1 (%)                                  
##   Mechanical Ventilation = 1 (%)
write.csv(tab1_2, "./Results/Demo_Table_1.csv", row.names = TRUE)  # Saving then reading in the same data allows for an easy way to adjust p-values, since it loads the object as a dataframe

# Need to adjust pvalues and arrange properly....hence the
# multiple dataframes below
tab1_2_padjust1 <- read.csv("./Results/Demo_Table_1.csv") %>%
    dplyr::rename(` ` = X, `No Infection` = X0, `Bacterial Infection` = X1)

tab1_2_padjust2 <- tab1_2_padjust1 %>%
    mutate(` ` = factor(` `, levels = tab1_2_padjust1$` `))

tab1_2_padjust3 <- tab1_2_padjust1 %>%
    mutate(test = ifelse(!is.na(p) & test == "", "chi.sq", test)) %>%
    group_by(test) %>%
    rstatix::adjust_pvalue(p.col = "p", method = "BH") %>%
    ungroup() %>%
    mutate(` ` = factor(` `, tab1_2_padjust2$` `)) %>%
    arrange(` `) %>%
    mutate(p = ifelse(is.na(p), "", p), p.adj = ifelse(is.na(p.adj),
        "", p.adj))

# Read in csv to then append adjusted pvalues
write.csv(tab1_2_padjust3, "./Results/Demo_Table_1_padjust.csv",
    row.names = FALSE)

Culture

# Percentage of infections # where there is an infection
# within 30 days after transplant and not 0 days before
cult_percent_infx_all <- peri_criteria_all %>%
    filter(bact_infection_present == "Yes", between(eday, 0,
        30)) %>%
    group_by(patientID, sampleID, eday) %>%
    arrange(-infx_stool) %>%
    dplyr::slice(1) %>%
    ungroup() %>%
    select(patientID, sampleID, bact_infection_present, infx_stool,
        organism1, micro1.factor) %>%
    distinct() %>%
    mutate(organism1 = gsub(x = organism1, pattern = "\\s+",
        replacement = ""), organism1 = str_to_lower(string = organism1),
        organism1 = ifelse(grepl(x = organism1, pattern = "enterococcus|enterobacterales|klebsiella|escherichia|citrobacter|proteus|staphyl|clostrid|pseudo|steno|bacteroides|helico"),
            organism1, "Culture Negative")) %>%
    group_by(patientID, sampleID, infx_stool) %>%
    mutate(`Enterococcus faecium` = grepl(x = organism1, pattern = "enterococcusfaecium",
        ignore.case = T), `Enterococcus faecalis` = grepl(x = organism1,
        pattern = "enterococcusfaecalis", ignore.case = T), `Enterococcus avium` = grepl(x = organism1,
        pattern = "enterococcusavium", ignore.case = T), `Enterococcus gallinarum` = grepl(x = organism1,
        pattern = "enterococcusgallinarum", ignore.case = T),
        `Klebsiella pneumoniae` = grepl(x = organism1, pattern = "klebsiellapneumoniae",
            ignore.case = T), `Enterobacter cloaceae` = grepl(x = organism1,
            pattern = "enterobactercloaceae", ignore.case = T),
        `Escherichia coli` = grepl(x = organism1, pattern = "escherichiacoli",
            ignore.case = T), `Citrobacter freundii` = grepl(x = organism1,
            pattern = "citrobacterfreundii", ignore.case = T),
        `Proteus mirabilis` = grepl(x = organism1, pattern = "proteusmirabilis",
            ignore.case = T), `Staphylococcus aureus` = grepl(x = organism1,
            pattern = "staphylococcusaureus", ignore.case = T),
        `Staphylococcus epidermis` = grepl(x = organism1, pattern = "staphylococcusepidermidis",
            ignore.case = T), `Pseudomonas aeruginosa` = grepl(x = organism1,
            pattern = "pseudomonasaeruginosa", ignore.case = T),
        `Stenotrophmonas maltophilia` = grepl(x = organism1,
            pattern = "stenotrophmonasmaltophilia", ignore.case = T),
        `Helicobacter pylori` = grepl(x = organism1, pattern = "helicobacterpylori",
            ignore.case = T), `Clostridium difficile` = grepl(x = organism1,
            pattern = "clostridiumdifficile|clostridioidesdifficile",
            ignore.case = T), `Bacteroides sp.` = grepl(x = organism1,
            pattern = "bacteroides", ignore.case = T), `Culture Negative` = grepl(x = organism1,
            pattern = "Culture Negative", ignore.case = T)) %>%
    pivot_longer(-c(patientID:micro1.factor), names_to = "organisms",
        values_to = "org_presence") %>%
    mutate(organisms = ifelse(bact_infection_present == "No",
        "No Bacterial Infection", organisms), org_presence = ifelse(org_presence ==
        TRUE, 1, 0)) %>%
    group_by(sampleID, infx_stool, bact_infection_present, organisms) %>%
    dplyr::slice_max(org_presence) %>%
    ungroup() %>%
    filter(organisms != "No Bacterial Infection") %>%
    group_by(patientID, bact_infection_present, organisms) %>%
    dplyr::slice_max(org_presence) %>%
    ungroup() %>%
    filter(org_presence == 1) %>%
    group_by(patientID, infx_stool, organisms) %>%
    dplyr::slice(1) %>%
    group_by(organisms) %>%
    tally() %>%
    ungroup() %>%
    mutate(total_infections = sum(n)) %>%
    replace_na(list(group = "unknown")) %>%
    group_by(organisms) %>%
    dplyr::summarize(percent = sum(n)/total_infections * 100,
        count = sum(n))

# Location of infections # where there is an infection
# within 30 days after transplant and not 0 days before
cult_location_infx_all <- peri_criteria_all %>%
    filter(bact_infection_present == "Yes", between(eday, 0,
        30)) %>%
    group_by(patientID, eday, key) %>%
    arrange(-infx_stool) %>%
    dplyr::slice(1) %>%
    ungroup() %>%
    select(patientID, bact_infection_present, infx_stool, organism1,
        key) %>%
    distinct() %>%
    mutate(organism1 = gsub(x = organism1, pattern = "\\s+",
        replacement = ""), organism1 = str_to_lower(string = organism1),
        organism1 = ifelse(grepl(x = organism1, pattern = "enterococcus|enterobacterales|klebsiella|escherichia|citrobacter|proteus|staphyl|clostrid|pseudo|steno|bacteroides|helico"),
            organism1, "Culture Negative")) %>%
    group_by(patientID, infx_stool) %>%
    mutate(`Enterococcus faecium` = grepl(x = organism1, pattern = "enterococcusfaecium",
        ignore.case = T), `Enterococcus faecalis` = grepl(x = organism1,
        pattern = "enterococcusfaecalis", ignore.case = T), `Enterococcus avium` = grepl(x = organism1,
        pattern = "enterococcusavium", ignore.case = T), `Enterococcus gallinarum` = grepl(x = organism1,
        pattern = "enterococcusgallinarum", ignore.case = T),
        `Klebsiella pneumoniae` = grepl(x = organism1, pattern = "klebsiellapneumoniae",
            ignore.case = T), `Enterobacter cloaceae` = grepl(x = organism1,
            pattern = "enterobactercloaceae", ignore.case = T),
        `Escherichia coli` = grepl(x = organism1, pattern = "escherichiacoli",
            ignore.case = T), `Citrobacter freundii` = grepl(x = organism1,
            pattern = "citrobacterfreundii", ignore.case = T),
        `Proteus mirabilis` = grepl(x = organism1, pattern = "proteusmirabilis",
            ignore.case = T), `Staphylococcus aureus` = grepl(x = organism1,
            pattern = "staphylococcusaureus", ignore.case = T),
        `Staphylococcus epidermis` = grepl(x = organism1, pattern = "staphylococcusepidermidis",
            ignore.case = T), `Pseudomonas aeruginosa` = grepl(x = organism1,
            pattern = "pseudomonasaeruginosa", ignore.case = T),
        `Stenotrophmonas maltophilia` = grepl(x = organism1,
            pattern = "stenotrophmonasmaltophilia", ignore.case = T),
        `Helicobacter pylori` = grepl(x = organism1, pattern = "helicobacterpylori",
            ignore.case = T), `Clostridium difficile` = grepl(x = organism1,
            pattern = "clostridiumdifficile|clostridioidesdifficile",
            ignore.case = T), `Bacteroides sp.` = grepl(x = organism1,
            pattern = "bacteroides", ignore.case = T), `Culture Negative` = grepl(x = organism1,
            pattern = "Culture Negative", ignore.case = T)) %>%
    pivot_longer(-c(patientID:key), names_to = "organisms", values_to = "org_presence") %>%
    mutate(organisms = ifelse(bact_infection_present == "No",
        "No Bacterial Infection", organisms), org_presence = ifelse(org_presence ==
        TRUE, 1, 0)) %>%
    group_by(patientID, key, infx_stool, bact_infection_present) %>%
    dplyr::slice_max(org_presence) %>%
    ungroup() %>%
    filter(org_presence == 1) %>%
    filter(organisms != "No Bacterial Infection") %>%
    group_by(patientID, infx_stool, key) %>%
    dplyr::slice(1) %>%
    group_by(key) %>%
    tally() %>%
    ungroup() %>%
    mutate(total_infections = sum(n)) %>%
    replace_na(list(group = "unknown")) %>%
    group_by(key) %>%
    dplyr::summarize(percent = sum(n)/total_infections * 100,
        count = sum(n))

# Type of Infection # where there is an infection within 30
# days after transplant and not 0 days before
cult_type_infx_all <- peri_criteria_all %>%
    filter(bact_infection_present == "Yes", grepl(variable, pattern = "anatomy.+"),
        between(eday, 0, 30)) %>%
    group_by(patientID, eday, diag_cat2) %>%
    arrange(-infx_stool) %>%
    dplyr::slice(1) %>%
    ungroup() %>%
    mutate(diag_cat3 = case_when(grepl("abdominal", diag_cat2,
        ignore.case = T) ~ "Abdominal", grepl("vir|bronchitis|covid-19|cmv",
        diag_cat2, ignore.case = T) ~ "Viral", grepl("bacteremia",
        diag_cat2, ignore.case = T) ~ "Bacteremia", grepl("thrush",
        diag_cat2, ignore.case = T) ~ "Fungal", grepl("cholangitis|empyema|panniculitis|peritonitis|latent tb",
        diag_cat2, ignore.case = T) ~ "Bacterial", grepl("cystitis|pyelonephritis",
        diag_cat2, ignore.case = T) ~ "Urinary Tract Infection",
        grepl("pneumonia", diag_cat2, ignore.case = T) ~ "Pneumonia",
        TRUE ~ diag_cat2), diag_cat3 = str_to_title(diag_cat3)) %>%
    ungroup() %>%
    select(patientID, bact_infection_present, infx_stool, organism1,
        micro1.factor, key, diag_cat3) %>%
    distinct() %>%
    mutate(organism1 = gsub(x = organism1, pattern = "\\s+",
        replacement = ""), organism1 = str_to_lower(string = organism1),
        organism1 = ifelse(grepl(x = organism1, pattern = "enterococcus|enterobacterales|klebsiella|escherichia|citrobacter|proteus|staphyl|clostrid|pseudo|steno|bacteroides|helico"),
            organism1, "Culture Negative")) %>%
    group_by(patientID, infx_stool) %>%
    mutate(`Enterococcus faecium` = grepl(x = organism1, pattern = "enterococcusfaecium",
        ignore.case = T), `Enterococcus faecalis` = grepl(x = organism1,
        pattern = "enterococcusfaecalis", ignore.case = T), `Enterococcus avium` = grepl(x = organism1,
        pattern = "enterococcusavium", ignore.case = T), `Enterococcus gallinarum` = grepl(x = organism1,
        pattern = "enterococcusgallinarum", ignore.case = T),
        `Klebsiella pneumoniae` = grepl(x = organism1, pattern = "klebsiellapneumoniae",
            ignore.case = T), `Enterobacter cloaceae` = grepl(x = organism1,
            pattern = "enterobactercloaceae", ignore.case = T),
        `Escherichia coli` = grepl(x = organism1, pattern = "escherichiacoli",
            ignore.case = T), `Citrobacter freundii` = grepl(x = organism1,
            pattern = "citrobacterfreundii", ignore.case = T),
        `Proteus mirabilis` = grepl(x = organism1, pattern = "proteusmirabilis",
            ignore.case = T), `Staphylococcus aureus` = grepl(x = organism1,
            pattern = "staphylococcusaureus", ignore.case = T),
        `Staphylococcus epidermis` = grepl(x = organism1, pattern = "staphylococcusepidermidis",
            ignore.case = T), `Pseudomonas aeruginosa` = grepl(x = organism1,
            pattern = "pseudomonasaeruginosa", ignore.case = T),
        `Stenotrophmonas maltophilia` = grepl(x = organism1,
            pattern = "stenotrophmonasmaltophilia", ignore.case = T),
        `Helicobacter pylori` = grepl(x = organism1, pattern = "helicobacterpylori",
            ignore.case = T), `Clostridium difficile` = grepl(x = organism1,
            pattern = "clostridiumdifficile|clostridioidesdifficile",
            ignore.case = T), `Bacteroides sp.` = grepl(x = organism1,
            pattern = "bacteroides", ignore.case = T), `Culture Negative` = grepl(x = organism1,
            pattern = "Culture Negative", ignore.case = T)) %>%
    pivot_longer(-c(patientID:diag_cat3), names_to = "organisms",
        values_to = "org_presence") %>%
    mutate(organisms = ifelse(bact_infection_present == "No",
        "No Bacterial Infection", organisms), org_presence = ifelse(org_presence ==
        TRUE, 1, 0)) %>%
    group_by(patientID, diag_cat3, infx_stool, bact_infection_present) %>%
    dplyr::slice_max(org_presence) %>%
    ungroup() %>%
    filter(org_presence == 1, organisms != "No Bacterial Infection") %>%
    ungroup() %>%
    group_by(patientID, infx_stool, diag_cat3) %>%
    dplyr::slice(1) %>%
    group_by(diag_cat3) %>%
    tally() %>%
    ungroup() %>%
    mutate(total_infections = sum(n)) %>%
    group_by(diag_cat3) %>%
    dplyr::summarize(percent = sum(n)/total_infections * 100,
        count = sum(n))

culture_tot <- bind_rows(cult_percent_infx_all, cult_location_infx_all,
    cult_type_infx_all) %>%
    mutate(percent = round(percent, 2))

# Read in csv to then append adjusted pvalues
write.csv(culture_tot, "./Results/Culture_Table_1.csv", row.names = FALSE)

Antibiotics

## Vector of variables to summarize
abx_vars <- c("Basiliximab", "Mycophenolate", "Steroid", "Systemic Vancomycin",
    "Tacrolimus", "Cefepime", "Metronidazole", "Piperacillin/Tazobactam",
    "Rifaximin", "Ceftriaxone", "Ciprofloxacin", "Gentamicin",
    "Tobramicin", "Daptomycin", "Meropenem", "Oral Vancomycin")

abx2 <- abx %>%
    filter(grepl(pattern = "basilix|tacro|steroid|mycophenolate|lactulose",
        medication_name, ignore.case = T) | grepl(pattern = "GLUCOCORTICOIDS|steroid",
        med_pharm_class, ignore.case = T) | grepl(pattern = "steroid",
        med_pharm_sub_class, ignore.case = T) | grepl("given",
        mar_action, ignore.case = T)) %>%
    mutate(Immunosuppressants = case_when(grepl("basilix", medication_name,
        ignore.case = T) ~ "Basiliximab", grepl("tacro", medication_name,
        ignore.case = T) ~ "Tacrolimus", grepl("one|ide|solu-cortef",
        medication_name, ignore.case = T) ~ "Steroid", grepl("mycophenolate",
        medication_name, ignore.case = T) ~ "Mycophenolate",
        grepl("lactulose", medication_name, ignore.case = T) ~
            "Lactulose"), Antibiotics = case_when(grepl("rifaximin",
        medication_name, ignore.case = T) ~ "Rifaximin", grepl("lactulose",
        medication_name, ignore.case = T) ~ "Lactulose", grepl("ceftriaxone",
        medication_name, ignore.case = T) ~ "Ceftriaxone", grepl("piperacillin|tazobactam",
        medication_name, ignore.case = T) ~ "Piperacillin/Tazobactam",
        grepl("cefepime", medication_name, ignore.case = T) ~
            "Cefepime", grepl("meropenem", medication_name, ignore.case = T) ~
            "Meropenem", grepl("gentamicin", medication_name,
            ignore.case = T) ~ "Gentamicin", grepl("tobramycin",
            medication_name, ignore.case = T) ~ "Tobramicin",
        grepl("vancomycin.+oral", medication_name, ignore.case = T) ~
            "Oral Vancomycin", grepl("vancomycin.+(IV|Intravenous)",
            medication_name, ignore.case = T) ~ "Systemic Vancomycin",
        grepl("METRONIDAZOLE", medication_name, ignore.case = T) ~
            "Metronidazole", grepl("DAPTOMYCIN", medication_name,
            ignore.case = T) ~ "Daptomycin", grepl("linezolid",
            medication_name, ignore.case = T) ~ "Linezolid",
        grepl("fluconazole", medication_name, ignore.case = T) ~
            "Fluconazole", grepl("micafungin", medication_name,
            ignore.case = T) ~ "Micafungin", grepl("cipro", medication_name,
            ignore.case = T) & !grepl("drop", dose_units, ignore.case = T) ~
            "Ciprofloxacin"), action = case_when(!is.na(Immunosuppressants) &
        between(days_transplant, 0, 30) | !is.na(Immunosuppressants) &
        ordering_mode == "Outpatient" ~ "keep", !is.na(Antibiotics) &
        between(days_transplant, -7, 1) ~ "keep", TRUE ~ "remove")) %>%
    group_by(patientID, Immunosuppressants, Antibiotics) %>%
    arrange(days_transplant) %>%
    filter(action == "keep") %>%
    dplyr::slice(1) %>%
    select(patientID, Immunosuppressants, Antibiotics) %>%
    left_join(peri_matrix_all %>%
        select(patientID, any_infection)) %>%
    pivot_longer(!c(patientID, any_infection), names_to = "variable",
        values_to = "value") %>%
    drop_na(value) %>%
    mutate(variable = 1) %>%
    pivot_wider(c(patientID, any_infection), names_from = "value",
        values_from = "variable", values_fn = min) %>%
    replace(is.na(.), 0)

abx_tab1_1 <- CreateTableOne(vars = abx_vars, testNonNormal = "wilcox.test",
    includeNA = FALSE, factorVars = abx_vars, strata = "any_infection",
    data = abx2)

summary(abx_tab1_1)
## 
##      ### Summary of categorical variables ### 
## 
## any_infection: 0
##                      var  n miss p.miss level freq percent cum.percent
##              Basiliximab 82    0    0.0     0   28    34.1        34.1
##                                             1   54    65.9       100.0
##                                                                       
##            Mycophenolate 82    0    0.0     0   10    12.2        12.2
##                                             1   72    87.8       100.0
##                                                                       
##                  Steroid 82    0    0.0     1   82   100.0       100.0
##                                                                       
##      Systemic Vancomycin 82    0    0.0     0   40    48.8        48.8
##                                             1   42    51.2       100.0
##                                                                       
##               Tacrolimus 82    0    0.0     1   82   100.0       100.0
##                                                                       
##                 Cefepime 82    0    0.0     0   56    68.3        68.3
##                                             1   26    31.7       100.0
##                                                                       
##            Metronidazole 82    0    0.0     0   48    58.5        58.5
##                                             1   34    41.5       100.0
##                                                                       
##  Piperacillin/Tazobactam 82    0    0.0     0   12    14.6        14.6
##                                             1   70    85.4       100.0
##                                                                       
##                Rifaximin 82    0    0.0     0   45    54.9        54.9
##                                             1   37    45.1       100.0
##                                                                       
##              Ceftriaxone 82    0    0.0     0   66    80.5        80.5
##                                             1   16    19.5       100.0
##                                                                       
##            Ciprofloxacin 82    0    0.0     0   67    81.7        81.7
##                                             1   15    18.3       100.0
##                                                                       
##               Gentamicin 82    0    0.0     0   81    98.8        98.8
##                                             1    1     1.2       100.0
##                                                                       
##               Tobramicin 82    0    0.0     0   78    95.1        95.1
##                                             1    4     4.9       100.0
##                                                                       
##               Daptomycin 82    0    0.0     0   81    98.8        98.8
##                                             1    1     1.2       100.0
##                                                                       
##                Meropenem 82    0    0.0     0   78    95.1        95.1
##                                             1    4     4.9       100.0
##                                                                       
##          Oral Vancomycin 82    0    0.0     0   81    98.8        98.8
##                                             1    1     1.2       100.0
##                                                                       
## ------------------------------------------------------------ 
## any_infection: 1
##                      var  n miss p.miss level freq percent cum.percent
##              Basiliximab 25    0    0.0     0    7    28.0        28.0
##                                             1   18    72.0       100.0
##                                                                       
##            Mycophenolate 25    0    0.0     0    1     4.0         4.0
##                                             1   24    96.0       100.0
##                                                                       
##                  Steroid 25    0    0.0     1   25   100.0       100.0
##                                                                       
##      Systemic Vancomycin 25    0    0.0     0    8    32.0        32.0
##                                             1   17    68.0       100.0
##                                                                       
##               Tacrolimus 25    0    0.0     1   25   100.0       100.0
##                                                                       
##                 Cefepime 25    0    0.0     0   14    56.0        56.0
##                                             1   11    44.0       100.0
##                                                                       
##            Metronidazole 25    0    0.0     0   13    52.0        52.0
##                                             1   12    48.0       100.0
##                                                                       
##  Piperacillin/Tazobactam 25    0    0.0     0    2     8.0         8.0
##                                             1   23    92.0       100.0
##                                                                       
##                Rifaximin 25    0    0.0     0   13    52.0        52.0
##                                             1   12    48.0       100.0
##                                                                       
##              Ceftriaxone 25    0    0.0     0   18    72.0        72.0
##                                             1    7    28.0       100.0
##                                                                       
##            Ciprofloxacin 25    0    0.0     0   21    84.0        84.0
##                                             1    4    16.0       100.0
##                                                                       
##               Gentamicin 25    0    0.0     0   24    96.0        96.0
##                                             1    1     4.0       100.0
##                                                                       
##               Tobramicin 25    0    0.0     0   22    88.0        88.0
##                                             1    3    12.0       100.0
##                                                                       
##               Daptomycin 25    0    0.0     0   22    88.0        88.0
##                                             1    3    12.0       100.0
##                                                                       
##                Meropenem 25    0    0.0     0   24    96.0        96.0
##                                             1    1     4.0       100.0
##                                                                       
##          Oral Vancomycin 25    0    0.0     0   24    96.0        96.0
##                                             1    1     4.0       100.0
##                                                                       
## 
## p-values
##                            pApprox     pExact
## Basiliximab             0.74143512 0.63341955
## Mycophenolate           0.42082759 0.45169519
## Steroid                         NA         NA
## Systemic Vancomycin     0.21234695 0.17136371
## Tacrolimus                      NA         NA
## Cefepime                0.37287647 0.33702201
## Metronidazole           0.72844870 0.64658181
## Piperacillin/Tazobactam 0.60142514 0.51269562
## Rifaximin               0.98119534 0.82233980
## Ceftriaxone             0.53110302 0.40817712
## Ciprofloxacin           1.00000000 1.00000000
## Gentamicin              0.95599591 0.41438900
## Tobramicin              0.42443880 0.35037337
## Daptomycin              0.05938894 0.03899733
## Meropenem               1.00000000 1.00000000
## Oral Vancomycin         0.95599591 0.41438900
## 
## Standardize mean differences
##                             1 vs 2
## Basiliximab             0.13310348
## Mycophenolate           0.30385759
## Steroid                 0.00000000
## Systemic Vancomycin     0.34709743
## Tacrolimus              0.00000000
## Cefepime                0.25550557
## Metronidazole           0.13174842
## Piperacillin/Tazobactam 0.21056770
## Rifaximin               0.05772164
## Ceftriaxone             0.20043582
## Ciprofloxacin           0.06085600
## Gentamicin              0.17507370
## Tobramicin              0.25833951
## Daptomycin              0.44449214
## Meropenem               0.04264158
## Oral Vancomycin         0.17507370
abx_tab1_2 <- print(abx_tab1_1, formatOptions = list(big.mark = ","))
##                                  Stratified by any_infection
##                                   0           1           p      test
##   n                               82          25                     
##   Basiliximab = 1 (%)             54 ( 65.9)  18 ( 72.0)   0.741     
##   Mycophenolate = 1 (%)           72 ( 87.8)  24 ( 96.0)   0.421     
##   Steroid = 1 (%)                 82 (100.0)  25 (100.0)      NA     
##   Systemic Vancomycin = 1 (%)     42 ( 51.2)  17 ( 68.0)   0.212     
##   Tacrolimus = 1 (%)              82 (100.0)  25 (100.0)      NA     
##   Cefepime = 1 (%)                26 ( 31.7)  11 ( 44.0)   0.373     
##   Metronidazole = 1 (%)           34 ( 41.5)  12 ( 48.0)   0.728     
##   Piperacillin/Tazobactam = 1 (%) 70 ( 85.4)  23 ( 92.0)   0.601     
##   Rifaximin = 1 (%)               37 ( 45.1)  12 ( 48.0)   0.981     
##   Ceftriaxone = 1 (%)             16 ( 19.5)   7 ( 28.0)   0.531     
##   Ciprofloxacin = 1 (%)           15 ( 18.3)   4 ( 16.0)   1.000     
##   Gentamicin = 1 (%)               1 (  1.2)   1 (  4.0)   0.956     
##   Tobramicin = 1 (%)               4 (  4.9)   3 ( 12.0)   0.424     
##   Daptomycin = 1 (%)               1 (  1.2)   3 ( 12.0)   0.059     
##   Meropenem = 1 (%)                4 (  4.9)   1 (  4.0)   1.000     
##   Oral Vancomycin = 1 (%)          1 (  1.2)   1 (  4.0)   0.956
write.csv(abx_tab1_2, "./Results/ABX_Table_1.csv", row.names = TRUE)  # Saving then reading in the same data allows for an easy way to adjust p-values, since it loads the object as a dataframe

# Need to adjust pvalues and arrange properly....hence the
# multiple dataframes below
abx_tab1_2_padjust1 <- read.csv("./Results/ABX_Table_1.csv") %>%
    dplyr::rename(` ` = X, `No Infection` = X0, `Bacterial Infection` = X1)

abx_tab1_2_padjust2 <- abx_tab1_2_padjust1 %>%
    mutate(` ` = factor(` `, levels = abx_tab1_2_padjust1$` `))

abx_tab1_2_padjust3 <- abx_tab1_2_padjust1 %>%
    mutate(test = ifelse(!is.na(p) & is.na(test), "chi.sq", "")) %>%
    group_by(test) %>%
    rstatix::adjust_pvalue(p.col = "p", method = "BH") %>%
    ungroup() %>%
    mutate(` ` = factor(` `, abx_tab1_2_padjust2$` `)) %>%
    arrange(` `) %>%
    mutate(p = ifelse(is.na(p), "", p), p.adj = ifelse(is.na(p.adj),
        "", p.adj))

# Read in csv to then append adjusted pvalues
write.csv(abx_tab1_2_padjust3, "./Results/ABX_Table_1_padjust.csv",
    row.names = FALSE)

Miscellaneous Statistics

# Expansions and Infections Stats
expan_infx_stats <-
peri_matrix_all %>% 
  left_join(peri_matrix_all %>% select(sampleID, ends_with("infection")) %>% ungroup()) %>% 
  mutate(ecoc_infx = enterococcus_infection,
         ecoc_infx = ifelse(ecoc_infx >= 1, 1, 0),
         ebac_infx = enterobacterales_infection,
         ebac_infx = ifelse(ebac_infx >= 1, 1, 0)) %>% 
  select(enterococcus_rel_abundance, enterobacterales_rel_abundance, ecoc_infx, ebac_infx) %>% 
  summarise(ecoc_expan_above_20_pct = sum(enterococcus_rel_abundance >= 0.2),
            ecoc_expan_above_cutpoint = sum(enterococcus_rel_abundance >= optimal_cutpoint_rel$optimal_cutpoint[2]),
            ecoc_infx_ecoc_below_cutpoint = sum(ecoc_infx == 1 & enterococcus_rel_abundance < optimal_cutpoint_rel$optimal_cutpoint[2], na.rm=T),
            ecoc_infx_ecoc_above_cutpoint = sum(ecoc_infx == 1 & enterococcus_rel_abundance >= optimal_cutpoint_rel$optimal_cutpoint[2], na.rm=T),
            
            ebac_expan_above_5_pct = sum(enterobacterales_rel_abundance >= 0.05),
            ebac_expan_above_cutpoint = sum(enterobacterales_rel_abundance >= optimal_cutpoint_rel$optimal_cutpoint[1]),
            ebac_infx_ebac_below_cutpoint = sum(ebac_infx == 1 & enterobacterales_rel_abundance < optimal_cutpoint_rel$optimal_cutpoint[1], na.rm=T),
            ebac_infx_ebac_above_cutpoint = sum(ebac_infx == 1 & enterobacterales_rel_abundance >= optimal_cutpoint_rel$optimal_cutpoint[1], na.rm=T)) %>%
  rownames_to_column(var = "rowname") %>% 
  pivot_longer(-rowname, names_to = "metric", values_to = "count") %>% 
  mutate(percent = round((count / 107) * 100, 2)) %>% 
  select(-rowname)

expan_infx_stats
## # A tibble: 8 × 3
##   metric                        count percent
##   <chr>                         <int>   <dbl>
## 1 ecoc_expan_above_20_pct          43   40.2 
## 2 ecoc_expan_above_cutpoint        44   41.1 
## 3 ecoc_infx_ecoc_below_cutpoint     2    1.87
## 4 ecoc_infx_ecoc_above_cutpoint    15   14.0 
## 5 ebac_expan_above_5_pct           18   16.8 
## 6 ebac_expan_above_cutpoint        29   27.1 
## 7 ebac_infx_ebac_below_cutpoint     0    0   
## 8 ebac_infx_ebac_above_cutpoint     8    7.48
write.csv(expan_infx_stats, "./Results/Appendix_Table_1.csv", row.names = FALSE)

Save Data Image

save.image(file = "./Data/LT_Modeling.RData")